In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset from the CSV file (path is relative to the notebook's working directory).
df_csv = pd.read_csv('Major_Crime_Indicators_Open_Data.csv')
# Alternative (disabled): load from the Excel file with the same base name instead.
# df_excel = pd.read_excel('Major_Crime_Indicators_Open_Data.xlsx')

# Count missing (NaN) values per column.
df_csv.isna().sum()


# finding outliers
def detectOutliers(dataFrame, factor=1.5):
    """Flag outliers in every column using the interquartile-range (IQR) rule.

    Each column is first coerced to numeric; entries that cannot be parsed
    become NaN. A value is flagged when it lies below ``Q1 - factor * IQR``
    or above ``Q3 + factor * IQR`` of its own column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Input data. It is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence width. The default
        reproduces the previously hard-coded 1.5.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index and columns as the input; True
        marks an outlier. NaN cells (missing or non-numeric) are never
        flagged because comparisons against NaN evaluate to False.
    """
    # Work on a coerced copy so the caller's frame is left untouched.
    numeric = dataFrame.apply(pd.to_numeric, errors='coerce')

    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # The bounds are Series indexed by column name, so they align column-wise.
    return (numeric < lower_bound) | (numeric > upper_bound)

# Count, per column, how many values detectOutliers flags.
detectOutliers(df_csv).sum()

# retrieve all unique values in a column
df_csv['OCC_YEAR'].unique()

# dropping columns; errors='ignore' keeps this step re-runnable once the
# columns have already been removed (a plain drop would raise KeyError)
df_csv = df_csv.drop(columns = ['OCC_DOW', 'OCC_DOY'], errors = 'ignore')




### Handling NaN values
Handling NaN (Not a Number) values is an important aspect of data preprocessing. There are several common strategies to deal with NaN values in a DataFrame:

    1. Removing Rows or Columns:
    dropna(): Removes rows or columns containing NaN values.
    df.dropna()  # removes rows with NaN values
    df.dropna(axis=1)  # removes columns with NaN values
    
    2. Filling NaN values:
    fillna(): Fills NaN values with a specified value or using various filling methods.
    df.fillna(value)  # fills NaN with a specific value
    df.ffill()  # fills NaN values with the previous non-NaN value (forward fill)
    df.bfill()  # fills NaN values with the next non-NaN value (backward fill)
    
    3. Imputation: Impute NaN values with statistical measures like mean, median, or mode.
    df['column_name'] = df['column_name'].fillna(df['column_name'].mean())
    
    4. Interpolation: Use interpolation methods to estimate NaN values based on existing data.
    df.interpolate(method='linear', inplace=True)
    
    5. Masking: Create a Boolean mask to identify and handle NaN values selectively.
    df[df.isna()] = value  # replace NaN with a specific value


In [7]:
# handling NaN values: forward-fill each NaN with the previous non-NaN value in its column
df_csv = df_csv.ffill()
# finding nan values that remain after the forward fill
df_csv.isna().sum()

# showing correlation for numerical columns
num_cols = df_csv.select_dtypes(include = [np.number]).columns
corrleation = df_csv[num_cols].corr()
# Give the heatmap its own figure/axes so the bar chart below cannot be
# drawn on top of it via the implicit "current axes".
fig_corr, ax_corr = plt.subplots()
sns.heatmap(corrleation, cmap = 'coolwarm', ax = ax_corr)
ax_corr.set_title('Correlation of Numerical Columns')
plt.show()

# showing a barchart with matplotlib
offence_count = df_csv['OFFENCE'].value_counts()

fig_bar, ax_bar = plt.subplots()
ax_bar.bar(offence_count.index, offence_count)
ax_bar.set_xlabel('Offence Types')
ax_bar.set_ylabel('Count')
ax_bar.set_title('Bar Chart of Offence Types')
# Category names are text labels; rotate them so they do not overlap.
ax_bar.tick_params(axis = 'x', labelrotation = 90)
plt.show()