In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# CrimeCast: Forecasting Crime Categories

# *1. Imported Required Libraries*

In [None]:
import numpy as np
from numpy import nan
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import learning_curve

from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report, accuracy_score

from sklearn.dummy import DummyClassifier

from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.svm import LinearSVC

from sklearn.neural_network import MLPClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.utils.multiclass import type_of_target

In [None]:
# Seed will ensure that the sequence of random numbers generated by NumPy will be the same each time when we run the code.

np.random.seed(306)

In [None]:
# ShuffleSplit is a cross-validation technique that generates n_splits independent random train/test splits:
# each split is created by shuffling the data and then dividing it into a training part and a test part.

# cv=ShuffleSplit(n_splits=10,test_size=0.2,random_state=42)

In [None]:
import warnings
warnings.filterwarnings("ignore")

# 2. *EDA*

## a) Loading the dataset

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_csv = "/kaggle/input/crime-cast-forecasting-crime-categories/train.csv"
test_csv = "/kaggle/input/crime-cast-forecasting-crime-categories/test.csv"
data, test_data = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
pd.set_option("display.max_columns",data.shape[1])

## Inspecting the Data

In [None]:
data.head()

In [None]:
test_data.head()

## Shape of training data

In [None]:
data.shape

In [None]:
test_data.shape

## Details of features

In [None]:
data.info()

In [None]:
test_data.info()

## b) Summary Statistics

In [None]:
data.describe()

In [None]:
data.describe(include=object)

In [None]:
test_data.describe()

In [None]:
test_data.describe(include='object')

## *Insights*
## *It helped in identifying high values (outliers) or missing data patterns.*
## *Numerical features have varying scales, so they need to be scaled.*

## c) Identify Missing Values

In [None]:
data.isnull().sum()

In [None]:
test_data.isnull().sum()

## *Insights*
## *Here some columns have missing values, which need to be addressed before modeling.*

## d) Visualize Data Distributions

## Name of the Numerical Features

In [None]:
# Names of the columns whose dtype is float64 or int64; reused later for the correlation heatmap.
num_features=data.select_dtypes(include=['float64', 'int64']).columns
print(num_features)

## Plotted histograms for numerical features

In [None]:
data.describe()

In [None]:
data[data['Latitude']==0].shape[0]

In [None]:
# Histogram of the Latitude column using 10 bins.
ax = data['Latitude'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Latitude')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Latitude Values')

## *Insights*
## *Latitude has a minimum value of 0.000000 and a maximum value of 34.328100, which means there are potential outliers at the lower end.*

In [None]:
data[data['Longitude']==0].shape[0]

In [None]:
# Histogram of the Longitude column using 10 bins.
ax = data['Longitude'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Longitude')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Longitude Values')

## *Insights*
## *Longitude has a minimum value of -118.663400 and a maximum value of 0.000000, which is unusual for longitude values in this data and indicates that outliers are present.*

In [None]:
# Histogram of the Time_Occurred column using 10 bins.
ax = data['Time_Occurred'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Time_Occurred')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Time_Occurred Values')

In [None]:
# Histogram of the Area_ID column using 10 bins.
ax = data['Area_ID'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Area_ID')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Area_ID Values')

In [None]:
# Histogram of the Reporting_District_no column using 10 bins.
ax = data['Reporting_District_no'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Reporting_District_no')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Reporting_District_no Values')

In [None]:
# Histogram of the 'Part 1-2' column using 10 bins.
ax = data['Part 1-2'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Part 1-2')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Part 1-2 Values')

In [None]:
# Histogram of the Victim_Age column using 10 bins.
ax = data['Victim_Age'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Victim_Age')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Victim_Age Values')

## *Insights*
## *Victim_Age has values ranging from -2.0 to 99.0, which seems unrealistic because of the negative age.*

In [None]:
# Histogram of the Premise_Code column using 10 bins.
ax = data['Premise_Code'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Premise_Code')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Premise_Code Values')

In [None]:
# Histogram of the Weapon_Used_Code column using 10 bins.
ax = data['Weapon_Used_Code'].plot.hist(bins=10, edgecolor='black')
ax.set_xlabel('Weapon_Used_Code')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Weapon_Used_Code Values')

## *Insights*
## *Visualized the distribution of numerical features.*
## *It Helped in understanding the range and frequency of data points for each numerical feature.*
## *Helped in understanding the data's shape, identifying outliers, comparing distributions.*
## *It provides a clear visual representation, also facilitate better data analysis and decision making.*

## e) Outlier Detection

In [None]:
# Box plots of the numerical columns on one shared axes, used to eyeball outliers.
# The axes is passed explicitly instead of relying on the implicit "current axes".

plt.figure(figsize=(10, 5))
data.boxplot(ax=plt.gca())
plt.xticks(rotation=45)

## *Insights*

## *The box represents the IQR.*
## *The line inside the box represents the median.*
## *Any points outside the whiskers would be considered outliers and plotted individually.*
## *This visualization helps in quickly assessing the distribution characteristics and comparing different features in the dataset.*
## *Outliers can impact model performance, so they need to be handled appropriately.*

## *Latitude has a minimum value of 0.000000 and a maximum value of 34.328100, which means there are potential outliers at the lower end.*
## *Longitude has a minimum value of -118.663400 and a maximum value of 0.000000, which is unusual for longitude values in this data and indicates that outliers are present.*
## *Victim_Age has values ranging from -2.0 to 99.0, which seems unrealistic because of the negative age.*

## Pairplot

In [None]:
# sns.pairplot(data=data,hue='Crime_Category')

## *Insights*
## *A pairplot is a visualization tool for exploring relationships between multiple features in a dataset. Displaying scatterplots of each pair of features, together with the distribution of each individual feature, helps in identifying patterns, correlations, and potential outliers. The hue-based coloring allows a more detailed analysis of how different categories in the data compare with each other.*

## Violinplot

In [None]:
# Horizontal violin plots of the wide-form frame, one violin per plotted column.
# NOTE(review): seaborn 0.13 deprecates the `scale` keyword in favour of `density_norm`;
# confirm the installed seaborn version before switching.
sns.violinplot(data=data, orient="h", palette="Set2",  scale="width")

## *Insights*
## *Violin plots are  useful for visualizing complex distributions of features and making comparisons across multiple groups or categories in a dataset.*

## f) Feature Relationships

In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated heatmap.

corr_mat = data.loc[:, num_features].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_mat, annot=True, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

## *Insights*
## *It shows the pairwise correlation between numerical features.*
## *Also helped in understanding the relationship between numerical features and identifying redundant features.*
## *High correlation between certain features indicate multicollinearity, which can affect the stability and interpretability of models.*
## *From the pair of features which are highly correlated take only one feature.*
## *This type of visualization is essential during exploratory data analysis because it makes it easier to make informed decisions about statistical analyses or machine learning model development.*

## g) Categorical Features Analysis

## Name of the Categorical Features

In [None]:
# Names of the columns stored with the object dtype.
cat_features = data.select_dtypes(include=['object']).columns
cat_features

In [None]:
# cat_features = data.select_dtypes(include=['object']).columns
# for column in cat_features:
#     plt.figure(figsize=(10, 5))
#     sns.countplot(y=column, data=data)
#     plt.show()

## *Insights*
## *Bar plots for categorical features to visualize the distribution of each category.*
## *By analyzing the frequency of each category, you can identify the most and least common categories. This helps in understanding the distribution of the categorical features.*
## *The mode of a categorical feature indicates the most common category, which can be an important insight, especially in understanding the dominant behavior in the data.*

## Unique Label names

In [None]:
data['Crime_Category'].unique()

## Unique Label Counts

In [None]:
data['Crime_Category'].value_counts()

## Distribution of samples in each classes

In [None]:
# Bar chart of how many training rows fall into each Crime_Category value.
ax = data['Crime_Category'].value_counts().plot.bar()
ax.set_xlabel('Crime Category')
ax.set_ylabel('Count')
# plt.xticks(rotation=0)
# plt.figure(figsize=(20,5))
# plt.tight_layout()

## *Insights*
## *Property crimes are the most frequent, followed by violent crimes and other.*

# 3. *Data Preprocessing and Cleaning*

In [None]:
data.head()

## Checking for null values in training and test dataset

In [None]:
data.isna().sum()

In [None]:
test_data.isna().sum()

## Convert Date_Reported and Date_Occurred to datetime for training dataset

## Extract day, month and year from Date_Reported and Date_Occurred from training dataset

## Extracting features such as day, month and year from Date_Reported and Date_Occurred provides additional dimensions for the analysis.

In [None]:
# Parse 'Date_Reported' and 'Date_Occurred' in the training data as datetimes and
# derive year, month and day-of-month columns from each of them.
# The mapping order fixes the order in which the new columns are appended.
train_date_parts = {
    'Date_Reported': {'Reported_Year': 'year', 'Reported_Month': 'month', 'Reported_Day': 'day'},
    'Date_Occurred': {'Occurred_Year': 'year', 'Occurred_Month': 'month', 'Occurred_Day': 'day'},
}

# First convert both source columns to datetime ...
for date_col in train_date_parts:
    data[date_col] = pd.to_datetime(data[date_col])

# ... then read the requested calendar component through the .dt accessor.
for date_col, new_cols in train_date_parts.items():
    for new_col, component in new_cols.items():
        data[new_col] = getattr(data[date_col].dt, component)




## Extract day, month and year from Date_Reported and Date_Occurred from test dataset

In [None]:
# Parse 'Date_Reported' and 'Date_Occurred' in the test data as datetimes and
# derive year, month and day-of-month columns from each of them.
# The mapping order fixes the order in which the new columns are appended.
test_date_parts = {
    'Date_Reported': {'Reported_Year': 'year', 'Reported_Month': 'month', 'Reported_Day': 'day'},
    'Date_Occurred': {'Occurred_Year': 'year', 'Occurred_Month': 'month', 'Occurred_Day': 'day'},
}

# First convert both source columns to datetime ...
for date_col in test_date_parts:
    test_data[date_col] = pd.to_datetime(test_data[date_col])

# ... then read the requested calendar component through the .dt accessor.
for date_col, new_cols in test_date_parts.items():
    for new_col, component in new_cols.items():
        test_data[new_col] = getattr(test_data[date_col].dt, component)

## *Insights*
## *Extracting and creating new features from the existing ones gives additional information and helps in improving the model performance.*

In [None]:
print(data.shape)
print(test_data.shape)

## Delete the Date_Reported and Date_Occurred features from the train and test datasets

In [None]:
# The raw date columns have been expanded into year/month/day features, so remove
# them (in place) from both frames.
raw_date_columns = ['Date_Reported', 'Date_Occurred']
for frame in (data, test_data):
    frame.drop(columns=raw_date_columns, inplace=True)

In [None]:
print(data.shape)
print(test_data.shape)

## Remove Outliers

In [None]:
# Accepted coordinate window (inclusive on both ends).
min_latitude, max_latitude = 33.0, 34.5
min_longitude, max_longitude = -119.0, -117.0

# Keep only training rows whose coordinates lie inside the window and whose
# Victim_Age is within [0, 100]; rows with a missing value in any of these
# columns fail the comparison and are dropped as well.
plausible_rows = (
    data['Latitude'].between(min_latitude, max_latitude)
    & data['Longitude'].between(min_longitude, max_longitude)
    & data['Victim_Age'].between(0, 100)
)
data = data[plausible_rows]

## Detecting missing values and imputation for training dataset

## Filling missing values with placeholders like 'Unknown' or 'None' for categorical features and with median value for numerical features, to ensure that the models can handle the incomplete data without giving any error.

In [None]:
# Checking for nan values in training dataset

data.isnull().sum()

In [None]:
# Fill NaN values in the training dataset:
#   * categorical columns get a placeholder string,
#   * the numerical Weapon_Used_Code column gets its own median.
#
# The frame is re-assigned from DataFrame.fillna instead of calling
# `data[col].fillna(..., inplace=True)`: the in-place form operates on a
# temporary Series (chained assignment), so with pandas Copy-on-Write enabled
# (the default from pandas 3) it would leave `data` unchanged.
train_fill_values = {
    'Cross_Street': 'Unknown',
    'Modus_Operandi': 'Unknown',
    'Victim_Sex': 'Unknown',
    'Victim_Descent': 'Unknown',
    'Premise_Description': 'Unknown',
    'Weapon_Used_Code': data['Weapon_Used_Code'].median(),
    'Weapon_Description': 'None',
}
data = data.fillna(train_fill_values)

In [None]:
# Verifying that no NaN values remain in the training dataset

data.isnull().sum()

## Detecting missing values and imputation for test dataset

In [None]:
# Checking for nan values in test dataset

test_data.isnull().sum()

In [None]:
# Fill NaN values in the test dataset:
#   * categorical columns get a placeholder string,
#   * the numerical Weapon_Used_Code column gets its median.
#
# The frame is re-assigned from DataFrame.fillna instead of calling
# `test_data[col].fillna(..., inplace=True)`: the in-place form operates on a
# temporary Series (chained assignment), so with pandas Copy-on-Write enabled
# (the default from pandas 3) it would leave `test_data` unchanged.
#
# NOTE(review): the median is taken from the test set itself, as before;
# consider reusing the training-set median so both sets are imputed identically.
test_fill_values = {
    'Cross_Street': 'Unknown',
    'Modus_Operandi': 'Unknown',
    'Victim_Sex': 'Unknown',
    'Victim_Descent': 'Unknown',
    'Premise_Description': 'Unknown',
    'Weapon_Used_Code': test_data['Weapon_Used_Code'].median(),
    'Weapon_Description': 'None',
}
test_data = test_data.fillna(test_fill_values)

In [None]:
# Verifying that no NaN values remain in the test dataset

test_data.isnull().sum()

In [None]:
data.shape

## *Insights*
## *Depending on the nature and proportion of missing values, missing entries were imputed and some affected rows were removed.*
## *A victim cannot have a negative age, so Victim_Age must be greater than or equal to zero.*

In [None]:
data.shape

In [None]:
test_data.shape

In [None]:
data.info()

## Numerical Features

In [None]:
# Every numeric column, whatever its exact width. Listing ['float64', 'int32']
# explicitly silently skipped int64 columns (for example, the .dt year/month/day
# features are int64 on pandas < 2 and int32 on pandas >= 2).
numeric_features = data.select_dtypes(include='number').columns
numeric_features

## Categorical Features

In [None]:
# Every non-numeric column. Excluding only ['float64', 'int32'] would wrongly
# report int64 columns as categorical (for example, the .dt year/month/day
# features are int64 on pandas < 2).
categorical_features = data.select_dtypes(exclude='number').columns
categorical_features

## Encoded the target variable Crime_Category using Label Encoder

In [None]:
# Encode the target Crime_Category as integer class ids. The fitted encoder is
# kept in `label_encoder_crime` so the ids can be mapped back to category names.

label_encoder_crime = LabelEncoder().fit(data['Crime_Category'])
data['Crime_Category'] = label_encoder_crime.transform(data['Crime_Category'])


## *Insights*
## *Label encoding is a preprocessing step that is used to convert categorical data (labels) into a numerical format that can be used by machine learning algorithms. This is important because many algorithms can only process numerical input.*


## Encoded the categorical features using Label Encoder

In [None]:
# Label-encode the categorical feature columns.
#
# For each column an encoder is fitted on the training data and stored in
# `label_encoder[col]`. Test-set values that were never seen during fitting are
# replaced by the placeholder '<unknown>', which is appended to the encoder's
# classes so it receives the next free integer code.
#
# NOTE: this cell is not idempotent - it overwrites the columns with integer
# codes, so re-running it without reloading the data re-encodes the codes.

cat_columns = ['Location','Cross_Street','Area_Name','Modus_Operandi',
                'Victim_Sex','Victim_Descent','Premise_Description',
                'Weapon_Description','Status','Status_Description']

label_encoder = {}
for col in cat_columns:
    lb_enco_cat = LabelEncoder()
    data[col] = lb_enco_cat.fit_transform(data[col])

    # store the label encoder
    label_encoder[col] = lb_enco_cat

    # Handle unseen labels in test data. A vectorised isin() replaces the
    # per-row `x not in classes_` lookup, which scanned the whole class array
    # for every test row.
    seen_in_train = test_data[col].isin(lb_enco_cat.classes_)
    test_data[col] = test_data[col].where(seen_in_train, '<unknown>')

    lb_enco_cat.classes_ = np.append(lb_enco_cat.classes_, '<unknown>')
    test_data[col] = lb_enco_cat.transform(test_data[col])

## Scaling the numerical features

## Scaling numerical features helps balance the influence of the features and helps the models converge faster and perform better.

In [None]:
# Standardise the numerical columns. The scaler learns its statistics from the
# training data only and is then applied to both the training and the test frame.

num_columns = ['Latitude','Longitude','Time_Occurred','Area_ID',
               'Reporting_District_no','Part 1-2','Victim_Age',
               'Premise_Code','Weapon_Used_Code','Reported_Year',
               'Reported_Month','Reported_Day','Occurred_Year',
               'Occurred_Month','Occurred_Day']

scaler = StandardScaler().fit(data[num_columns])

data[num_columns] = scaler.transform(data[num_columns])
test_data[num_columns] = scaler.transform(test_data[num_columns])

In [None]:
data.head()

In [None]:
test_data.head()

## Drop the label from training dataset

In [None]:
# Feature matrix: every column of the prepared training frame except the target.
X = data.drop(columns='Crime_Category')
X.head()


## Label

In [None]:
# Target vector: the Crime_Category column of the prepared training frame.
y=data['Crime_Category']
y.head()

## shape of feature matrix

In [None]:
print(X.shape)

## shape of label

In [None]:
print(y.shape)

In [None]:
type_of_target(y)

## Training-test split

In [None]:
# Hold out 20% of the rows for validation; random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y - consider stratify=y if the class
# proportions should match between the training and validation parts.
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
print(X_train.shape)
print(X_val.shape)

print(y_train.shape)
print(y_val.shape)

# Feature Extraction

## Model based feature Importance

## Understanding feature importance helps in refining the model and focusing on the most influential features.

In [None]:
# Fit a Random Forest on all features purely to rank them by importance.
ran_for_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Impurity-based importances, in the same order as the columns of X.
important_feature = ran_for_model.feature_importances_
print(important_feature)

# Pair every importance with its feature name, most important first.
important_feature_frame = (
    pd.DataFrame({'Feature': X.columns, 'Importance': important_feature})
    .sort_values(by='Importance', ascending=False)
)

# Top 18 Important features
top_features = important_feature_frame.head(18)
print(top_features)

In [None]:
# Restrict the training, validation and test frames to the 18 top-ranked features.
print("\nTop 18 Important features\n")
new_selected_features = top_features['Feature'].tolist()
print(new_selected_features)

X_train_new = X_train.loc[:, new_selected_features]
X_val_new = X_val.loc[:, new_selected_features]
test_data_new = test_data.loc[:, new_selected_features]

# Preview the first five rows of the reduced training frame.
X_train_new.head(5)

In [None]:
# Baseline Random Forest trained on the 18 selected features.
r_f_c_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_new, y_train)

# Score the model on the held-out validation set.
y_pred = r_f_c_model.predict(X_val_new)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy} ')

# Class predictions for the competition test set.
test_pred_rf = r_f_c_model.predict(test_data_new)

In [None]:
# Baseline XGBoost classifier trained on the 18 selected features.
xgb_model = XGBClassifier(n_estimators=100, random_state=42).fit(X_train_new, y_train)

# Score the model on the held-out validation set.
y_pred = xgb_model.predict(X_val_new)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy} ')

# Class predictions for the competition test set.
test_pred_xgb = xgb_model.predict(test_data_new)

In [None]:
# # Define the model
# log_model = LogisticRegression(max_iter=1000, random_state=42)

# # Initialize variables
# selected_features_lg = []
# all_features = list(X_train.columns)
# best_score = 0


# while all_features:
#     scores = []
#     for feat in all_features:
#         # Try adding the feature to the set of selected features
#         current_features = selected_features_lg + [feat]
#         X_train_1 = X_train[current_features]
#         # Perform cross-validation and compute the average score
#         score = cross_val_score(log_model, X_train_1, y_train, cv=5, scoring='accuracy').mean()
#         scores.append((score, feat))
    
    
#     # Get the best feature and score
#     scores.sort(reverse=True)
#     new_best_score, new_best_feature = scores[0]
#     print('scores:',scores)
#     print('new_best_score:',new_best_score)
#     print('new_best_feature:',new_best_feature)
    
#     if new_best_score > best_score:
#         selected_features_lg.append(new_best_feature)
#         all_features.remove(new_best_feature)
#         best_score = new_best_score
#         print(f"Added {new_best_feature} with score {new_best_score}")
#         print('selected_features_lg:',selected_features_lg)
#     else:
#         # No improvement
#         print("Thats all")
#         break

# print(f"Selected features lg: {selected_features_lg}")

In [None]:
# selected_features_lg=['Weapon_Description', 'Weapon_Used_Code', 'Victim_Sex', 'Part 1-2', 'Premise_Code', 
#                       'Status_Description', 'Victim_Age', 'Victim_Descent', 'Area_Name', 'Longitude']

# X_train_new_1=X_train[selected_features_lg]
# X_val_new_1=X_val[selected_features_lg]
# test_data_new_1 = test_data[selected_features_lg]
# X_train_new_1[:5]

In [None]:
# # Training the Random Forest Classifier model with the 10 selected features
# r_f_c_model = RandomForestClassifier(n_estimators=100, random_state=42)
# r_f_c_model.fit(X_train_new_1, y_train)
# # Predict on the validation set
# y_pred=r_f_c_model.predict(X_val_new_1)
# # Evaluate the model
# accuracy = accuracy_score(y_val, y_pred)
# #  y_val_new = y_val.values
# # classification_report = classification_report(y_val_new, y_pred)
# print('Validation Accuracy: {} '.format(accuracy))
# # print('Classification_report: {}'.format(classification_report))
# # Predict on the test set
# test_pred_rf_1=r_f_c_model.predict(test_data_new_1)
# print(test_pred_rf_1[:5])

In [None]:
# # Training the XGBoost Classifier model with the 10 selected features
# # xgb_model = XGBClassifier(objective='multi:softmax',num_class= 6,depth=4,learning_rate=0.1)
# xgb_model = XGBClassifier(n_estimators=100, random_state=42)
# xgb_model.fit(X_train_new_1, y_train)

# # Predict on the validation set
# y_pred=xgb_model.predict(X_val_new_1)

# # Evaluate the model
# accuracy=accuracy_score(y_val, y_pred)
# # y_val_new = y_val.values
# # classification = classification_report(y_val_new, y_pred)

# print('Validation Accuracy: {} '.format(accuracy))
# # print('Classification_report: {}'.format(classification))

# # Predict on the test set
# test_pred_xgb_1=xgb_model.predict(test_data_new_1)
# print(test_pred_xgb_1[:5])

In [None]:
# # Define the model
# xgb_model = XGBClassifier(n_estimators=100, random_state=42)

# # Initialize variables
# selected_features_xgb = []
# all_features = list(X_train.columns)
# best_score = 0

# while all_features:
#     scores = []
#     for feat in all_features:
#         # Try adding the feature to the set of selected features
#         current_features = selected_features_xgb + [feat]
#         X_train_2 = X_train[current_features]
#         # Perform cross-validation and compute the average score
#         score = cross_val_score(xgb_model, X_train_2, y_train, cv=5, scoring='accuracy').mean()
#         scores.append((score, feat))
    
    
#     # Get the best feature and score
#     scores.sort(reverse=True)
#     new_best_score, new_best_feature = scores[0]
#     print('scores:',scores)
#     print('new_best_score:',new_best_score)
#     print('new_best_feature:',new_best_feature)
    
#     if new_best_score > best_score:
#         selected_features_xgb.append(new_best_feature)
#         all_features.remove(new_best_feature)
#         best_score = new_best_score
#         print(f"Added {new_best_feature} with score {new_best_score}")
#         print('selected_features_xgb:',selected_features_xgb)
#     else:
#         # No improvement
#         print("Thats all")
#         break

# print(f"Selected features xgb: {selected_features_xgb}")

In [None]:
# selected_features_xgb=['Modus_Operandi', 'Weapon_Description', 'Part 1-2', 'Premise_Code', 'Victim_Age', 
#                        'Status_Description', 'Reported_Year', 'Weapon_Used_Code']

# X_train_new_2=X_train[selected_features_xgb]
# X_val_new_2=X_val[selected_features_xgb]
# test_data_new_2 = test_data[selected_features_xgb]
# X_train_new_2[:5]

In [None]:
# # Training the Random Forest Classifier model with the 10 selected features
# r_f_c_model = RandomForestClassifier(n_estimators=100, random_state=42)
# r_f_c_model.fit(X_train_new_2, y_train)
# # Predict on the validation set
# y_pred=r_f_c_model.predict(X_val_new_2)
# # Evaluate the model
# accuracy = accuracy_score(y_val, y_pred)
# #  y_val_new = y_val.values
# # classification_report = classification_report(y_val_new, y_pred)
# print('Validation Accuracy: {} '.format(accuracy))
# # print('Classification_report: {}'.format(classification_report))
# # Predict on the test set
# test_pred_rf_2=r_f_c_model.predict(test_data_new_2)
# print(test_pred_rf_2[:5])

In [None]:
# # Training the XGBoost Classifier model with the 10 selected features
# # xgb_model = XGBClassifier(objective='multi:softmax',num_class= 6,depth=4,learning_rate=0.1)
# xgb_model = XGBClassifier(n_estimators=100, random_state=42)
# xgb_model.fit(X_train_new_2, y_train)

# # Predict on the validation set
# y_pred=xgb_model.predict(X_val_new_2)

# # Evaluate the model
# accuracy=accuracy_score(y_val, y_pred)
# # y_val_new = y_val.values
# # classification = classification_report(y_val_new, y_pred)

# print('Validation Accuracy: {} '.format(accuracy))
# # print('Classification_report: {}'.format(classification))

# # Predict on the test set
# test_pred_xgb_2=xgb_model.predict(test_data_new_2)
# print(test_pred_xgb_2[:5])

In [None]:
# from sklearn.multiclass import OneVsRestClassifier
# svc=LinearSVC(random_state=0)
# rest=OneVsRestClassifier(svc)
# rest.fit(X_train,y_train)
# n=rest.predict(X_val)
# print(n[:5])
# s_c=accuracy_score(y_val,n)
# print(s_c)

In [None]:
# from sklearn.multiclass import OneVsOneClassifier
# svc=LinearSVC(random_state=0)
# one=OneVsOneClassifier(svc)
# one.fit(X_train,y_train)
# y_one_pred=one.predict(X_val)
# print(y_one_pred[:5])
# s_c=accuracy_score(y_val,y_one_pred)
# print(s_c)

In [None]:
# model=DummyClassifier(strategy='most_frequent')
# model.fit(X_train, y_train)
# y_pred=model.predict(X_val)
# s_c=accuracy_score(y_val,y_pred)
# print(s_c)

In [None]:
# svm_model=SVC(random_state=42)
# svm_model.fit(X_train,y_train)
# y_svm_pred=svm_model.predict(X_val)
# print(y_svm_pred[:5])
# svm_acc=accuracy_score(y_val,y_svm_pred)
# print(svm_acc)

In [None]:
#  svm_model=SVC(random_state=42)
# # param_grid = {'C': [0.1, 1, 10, 100, 1000],  
# #               'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
# #               'kernel': ['rbf']}  
# param_grid = {'C': [0.1, 1, 10],  
#               'gamma': [1, 0.1, 0.01], 
#               'kernel': ['rbf', 'poly'],
#               'degree':[5,7]}  
# svm_gscv = GridSearchCV(svm_model, param_grid, refit = True, verbose = 3)
# svm_gscv.fit(X_train, y_train)
# print(svm_gscv.best_params_)
# y_p = svm_gscv.best_estimator_.predict(X_val)
# accuracy_score(y_val,y_p)


In [None]:
# svm_model=SVC(kernel="poly",random_state=42,degree=7,gamma=1, C=1.0)
# svm_model.fit(X_train,y_train)
# y_svm_pred=svm_model.predict(X_val)
# print(y_svm_pred[:5])
# svm_acc=accuracy_score(y_val,y_svm_pred)
# print(svm_acc)

# Model building

## Tuning the XGBClassifier model using the 18 most important features

In [None]:
# class_1=XGBClassifier(random_state=42)
# param_grid = {
#     'learning_rate' : [0.1,0.15,0.2],
#     'n_estimators': [100,115,110],
#     'max_depth' : [7,8],              
#     'min_child_weight' : [3,4,5],
#     'subsample': [0.9,1.0,1.1],
#     'gamma' : [0.1,0.2],            
#     'colsample_bytree':[0.4,0.5,0.6]    
# }

# gscv_1 = GridSearchCV(class_1, param_grid, n_jobs=-1, cv=cv, verbose=2)
# gscv_1.fit(X_train_new,y_train)

In [None]:
# print(gscv_1.best_params_)
# print('Best parameters: {}'.format(gscv_1.best_params_))
# print('Best cross-validation score: {}'.format(gscv_1.best_score_))

In [None]:
# xgb_model_1=gscv_1.best_estimator_
# y_p = xgb_model_1.predict(X_val_new)
# print(y_p[:5])
# accuracy=accuracy_score(y_val,y_p)
# print('Validation Accuracy: {} '.format(accuracy))
# test_pred_xgb=xgb_model_1.predict(test_data_new)

In [None]:
# XGBoost classifier with fixed, previously tuned hyperparameters, trained on the
# selected features.
classifier_1 = XGBClassifier(
    n_estimators=115,
    learning_rate=0.1,
    max_depth=7,
    min_child_weight=3,
    gamma=0.1,
    subsample=1.0,
    colsample_bytree=0.4,
).fit(X_train_new, y_train)

# Validation-set predictions and accuracy.
y_pred = classifier_1.predict(X_val_new)
print(y_pred[:5])
acc = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {acc} ')

# Class predictions for the competition test set.
test_pred_1 = classifier_1.predict(test_data_new)

In [None]:
# class_2=XGBClassifier(random_state=42)
# param_grid = {
#     'learning_rate' : [0.1,0.15,0.2],
#     'n_estimators': [100,115,110],
#     'max_depth' : [7,8],              
#     'min_child_weight' : [3,4,5],
#     'subsample': [0.9,1.0,1.1],
#     'gamma' : [0.1,0.2],            
#     'colsample_bytree':[0.4,0.5,0.6]    
# }

# gscv_2 = GridSearchCV(class_2, param_grid, n_jobs=-1, cv=cv, verbose=2)
# gscv_2.fit(X_train_new_1,y_train)

In [None]:
# print(gscv_2.best_params_)

In [None]:
# classifier_2=XGBClassifier(colsample_bytree=0.6, gamma=0.2, learning_rate=0.1, 
#                          max_depth=7, min_child_weight= 4, n_estimators= 100, 
#                          subsample=1.0)
# classifier_2.fit(X_train_new_1,y_train)
# y_pred=classifier_2.predict(X_val_new_1)
# print(y_pred[:5])
# acc=accuracy_score(y_val,y_pred)
# print('Validation Accuracy: {} '.format(acc))
# test_pred_2=classifier_2.predict(test_data_new_1)

In [None]:
# class_3=XGBClassifier(random_state=42)
# param_grid = {
#     'learning_rate' : [0.1,0.15,0.2],
#     'n_estimators': [100,115,110],
#     'max_depth' : [7,8],              
#     'min_child_weight' : [3,4,5],
#     'subsample': [0.9,1.0,1.1],
#     'gamma' : [0.1,0.2],            
#     'colsample_bytree':[0.4,0.5,0.6]    
# }

# gscv_3 = GridSearchCV(class_3, param_grid, n_jobs=-1, cv=cv, verbose=2)
# gscv_3.fit(X_train_new_2,y_train)

In [None]:
# print(gscv_3.best_params_)

In [None]:
# classifier_3=XGBClassifier(colsample_bytree=0.5, gamma=0.1, learning_rate=0.1, 
#                          max_depth=8, min_child_weight= 4, n_estimators= 100, 
#                          subsample=0.9)

# classifier_3.fit(X_train_new_2,y_train)
# y_pred=classifier_3.predict(X_val_new_2)
# print(y_pred[:5])
# acc=accuracy_score(y_val,y_pred)
# print('Validation Accuracy: {} '.format(acc))
# test_pred_3=classifier_3.predict(test_data_new_2)

## Tuning of model hyperparameters using techniques like GridSearchCV to find the optimal parameters for each model.

# *Hyperparameter Tuning for Random Forest*

## Random Forests are robust and less prone to overfitting. They work well with a large number of features and can provide feature importances, which help in understanding which features contribute the most to the prediction.

In [None]:
# # Define the model
# rf_model = RandomForestClassifier(random_state=42)

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [5, 10, 15, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2']
# }

# # Set up GridSearchCV
# grid_search_cv_1 = GridSearchCV(rf_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# # Fit the model
# grid_search_cv_1.fit(X_train_new, y_train)

# # Get the best parameters and score
# rf_best_params = grid_search_cv_1.best_params_
# rf_best_score = grid_search_cv_1.best_score_
# print('Random Forest Best Parameter: {}'.format(rf_best_params))
# print('Random Forest Best Cross-Validation Score: {}'.format(rf_best_score))

In [None]:
# print('Random Forest Best Parameter: {}'.format(rf_best_params))
# print('Random Forest Best Cross-Validation Score: {}'.format(rf_best_score))

# Random Forest Best Parameter: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
# Random Forest Best Cross-Validation Score: 0.9072303233761279

In [None]:
# Fit the tuned Random Forest, score it on the validation split, and predict the test set.
# random_state pins the bootstrap / feature sampling so reruns reproduce the same metrics and
# predictions; n_jobs=-1 only parallelizes tree building and does not change the fitted model.
forest = RandomForestClassifier(max_depth= 20, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 5, 
                                n_estimators= 500, random_state=42, n_jobs=-1)

forest.fit(X_train_new,y_train)
y_pred = forest.predict(X_val_new)
print(y_pred[:5])

# Calculate the validation accuracy
accuracy = accuracy_score(y_val,y_pred)

# Generate the classification report
classification_report_forest = classification_report(y_val, y_pred)

# Generate the confusion matrix.
# The label set is the union of true and predicted classes and is passed explicitly, so the
# matrix rows/columns and the tick labels below always have the same length and order (a class
# predicted but absent from y_val would otherwise break the plot).
labels_forest = np.union1d(y_val, y_pred)
confusion_matrix_forest = confusion_matrix(y_val, y_pred, labels=labels_forest)

print('Validation Accuracy: {}'.format(accuracy))
print("\nClassification Report:\n", classification_report_forest)
# print("\nConfusion Matrix:\n", confusion_matrix_forest)

# Display the confusion matrix using ConfusionMatrixDisplay.
# Named cm_display_forest (not `display`) so IPython's built-in display() is not shadowed.
cm_display_forest = ConfusionMatrixDisplay(confusion_matrix_forest, display_labels=labels_forest)
cm_display_forest.plot(cmap='Blues')
plt.title('\nConfusion Matrix for Random Forest Model\n')
plt.show()

# Predictions on the held-out test features
test_pred_forest = forest.predict(test_data_new)

## *Insights*
## *The parameters of RandomForestClassifier provide flexibility to control the complexity and performance of the model. By adjusting these parameters, we can improve the model's accuracy, reduce overfitting, and optimize performance on the dataset.*


## Hyperparameter Tuning for XGBoost

## XGBoost (Extreme Gradient Boosting) is a flexible and powerful model that performs well on a variety of tasks because of its ability to handle different types of data and its built-in regularization.
## Hyperparameter tuning e.g., learning rate, number of estimators and max depth can improve performance.

In [None]:
# # Define the model
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',random_state=42)

# # # Define the parameter grid
# param_grid = {
    
#     'n_estimators': [70,80,90,100,125],
#     'max_depth': np.arange(10,13),
#     'learning_rate': [0.01,0.05,0.1],
#     'subsample': [0.7,0.8,0.9],
#     'colsample_bytree': [0.4,0.5,0.6],
#     'gamma': [0.01, 0.1, 0.2],
#     'min_child_weight': [5, 8, 10],
#     'reg_alpha': [0.01, 0.1, 1],
#     'reg_lambda': [0.001, 0.01, 0.1]
# }


# # param_grid = {
# #     'n_estimators': [100, 200, 300],
# #     'learning_rate': [0.01, 0.1, 0.2],
# #     'max_depth': [3, 6, 9],
# #     'min_child_weight': [1, 5, 10],
# #     'gamma': [0, 0.1, 0.2],
# #     'subsample': [0.8, 0.9, 1.0],
# #     'colsample_bytree': [0.8, 0.9, 1.0]
# # }

# # param_grid = {
# #     'n_estimators': [200,225,250,300],
# # #     'learning_rate': [0.001,0.005,0.01,0.05,0.1,0.2],
# #     'max_depth': [9, 10,11,12],
# # #     'min_child_weight': [1, 5, 10],
# #     'gamma': [0.05,0.01,0.08,0.005,0.001],
# # #     'subsample': [0.8, 0.9, 1.0],
# # #     'colsample_bytree': [0.8, 0.9, 1.0]
# # }




# param_grid = { 'n_estimators': [100, 200, 300, 500, 1000],
#               'learning_rate': [0.001,0.005,0.01,0.05,0.02],
#               'subsample': np.arange()
# }






















# random_search_cv = RandomizedSearchCV(xgb_model, param_distributions=param_grid, n_iter=50, cv=5, 
#                                    scoring='accuracy', verbose=1, n_jobs=-1)

# grid_search_cv_2 = GridSearchCV(xgb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# # Fit the random search model
# grid_search_cv_2.fit(X_train_new, y_train)

# # Output the best parameters and score
# xgb_best_params = grid_search_cv_2.best_params_
# xgb_best_score = grid_search_cv_2.best_score_
# print('Best parameters: {}'.format(xgb_best_params))
# print('Best cross-validation score: {}'.format(xgb_best_score))

In [None]:
# print('Best parameters: {}'.format(xgb_best_params))
# print('Best cross-validation score: {}'.format(xgb_best_score))


# Best parameters: {'subsample': 0.6, 'reg_lambda': 0.001, 'reg_alpha': 1, 'n_estimators': 200, 
# 'min_child_weight': 9, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0.0, 'colsample_bytree': 0.5}
# Best cross-validation score: 0.9227174414906039
# [2 5 5 5 5]
# Validation Accuracy: 0.9266666666666666

# Best parameters: {'subsample': 0.8, 'reg_lambda': 0.001, 'reg_alpha': 1, 'n_estimators': 70, 
# 'min_child_weight': 8, 'max_depth': 12, 'learning_rate': 0.1, 'gamma': 0.01, 'colsample_bytree': 0.5}
# Best cross-validation score: 0.9227174414906039
# [2 5 5 5 5]

In [None]:
# pipe_1 = Pipeline(steps=[('scaler', StandardScaler()),
#                        ('xgb', XGBClassifier(colsample_bytree = 0.7, gamma = 0.1, learning_rate = 0.01, 
#                          max_depth = 9, min_child_weight = 1, n_estimators = 500, subsample = 0.8))])

# pipe_1 = Pipeline(steps=[('scaler', StandardScaler()),
#                        ('xgb', XGBClassifier(subsample = 0.7, reg_lambda = 0, reg_alpha = 0.1, n_estimators = 300, 
#                         min_child_weight = 7, max_depth = 6, learning_rate = 0.05, gamma = 0.4, colsample_bytree = 0.7))])

# pipe_1 = Pipeline(steps=[('scaler', StandardScaler()),
#                        ('xgb', XGBClassifier(subsample = 0.9999999999999999, reg_lambda = 0.01, reg_alpha = 0, 
#                         n_estimators = 280, min_child_weight = 11, max_depth = 12, learning_rate = 0.1, gamma = 0.8, 
#                         colsample_bytree = 0.5))])



# pipe_1 = Pipeline(steps=[('scaler', StandardScaler()),
#                        ('xgb', XGBClassifier(objective='multi:softmax',num_class=6,n_estimators = 200, min_child_weight = 9, max_depth = 10, learning_rate = 0.1, gamma = 0.01, 
#                         ))])






# XGBoost pipeline: standardize the features, then fit a gradient-boosted tree
# classifier with the hyperparameters listed below.
pipe_1 = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('xgb', XGBClassifier(
            n_estimators=650,
            learning_rate=0.1,
            max_depth=12,
            min_child_weight=8,
            gamma=0.01,
            subsample=0.8,
            colsample_bytree=0.5,
            reg_lambda=0.001,
        )),
    ]
)

                         
# pipe_1 = Pipeline(steps=[('scaler', StandardScaler()),
#                        ('xgb', XGBClassifier(learning_rate = 0.01, n_estimators = 650, subsample=0.8,
#                          min_samples_split=2,min_samples_leaf=2,min_weight_fraction_leaf=0.5,loss='softmax',
#                          min_child_weight = 8,max_depth = 12,min_impurity_decrease=0.5,
#                           max_features='sqrt',n_iter_no_change=5, tol=0.001,colsample_bytree=0.5))])

# Fit the XGBoost pipeline, score it on the validation split, and predict the test set.
pipe_1.fit(X_train_new,y_train)

y_pred = pipe_1.predict(X_val_new)
print(y_pred[:5])


# Calculate the validation accuracy
accuracy = accuracy_score(y_val,y_pred)

# Generate the classification report
classification_report_xgb = classification_report(y_val, y_pred)

# Generate the confusion matrix.
# The label set is the union of true and predicted classes and is passed explicitly, so the
# matrix rows/columns and the tick labels below always have the same length and order (a class
# predicted but absent from y_val would otherwise break the plot).
labels_xgb = np.union1d(y_val, y_pred)
confusion_matrix_xgb = confusion_matrix(y_val, y_pred, labels=labels_xgb)

print('Validation Accuracy: {}'.format(accuracy))
print("\nClassification Report:\n", classification_report_xgb)

# Predictions on the held-out test features
test_pred_xgb_1 = pipe_1.predict(test_data_new)

# Display the confusion matrix using ConfusionMatrixDisplay.
# Named cm_display_xgb (not `display`) so IPython's built-in display() is not shadowed.
cm_display_xgb = ConfusionMatrixDisplay(confusion_matrix_xgb, display_labels=labels_xgb)
cm_display_xgb.plot(cmap='Blues')
plt.title('\nConfusion Matrix for XGBoost Model\n')
plt.show()

## *Insights*
## *XGBoost is a powerful tool for classification: it combines the strength of gradient boosting with optimizations that make it fast and effective. The flexibility of its various parameters allows us to fine-tune the model.*

In [None]:
X_train_new.head()

## Hyperparameter Tuning for Logistic Regression

## Logistic Regression is a simple but effective linear model for classification problems, especially when the relationship between the features and the target variable is approximately linear.
## It provides probabilistic outputs and can be regularized to prevent overfitting, making it suitable for high-dimensional datasets.

In [None]:
# # Define the model
# lg_model = LogisticRegression(max_iter=10000,random_state=42)

# # Define the parameter grid
# param_grid = {
#     'penalty': ['l1', 'l2', 'elasticnet'],
# #     'penalty': ['l2'],
#     'C': [0.01, 0.1, 1, 10, 100],
#     'solver': ['liblinear', 'saga']
# }

# # Set up GridSearchCV
# grid_search_cv_3 = GridSearchCV(lg_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# # Fit the model
# grid_search_cv_3.fit(X_train_new, y_train)

# # Get the best parameters and score
# lg_best_params = grid_search_cv_3.best_params_
# lg_best_score = grid_search_cv_3.best_score_
# print('Logistic Regression Best Parameter: {}'.format(lg_best_params))
# print('Logistic Regression Best Cross-Validation Score: {}'.format(lg_best_score))

In [None]:
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('logistic', LogisticRegression(max_iter=20000, tol=1e-4))  
# ])

# param_grid = {
#     'logistic__C': [0.001, 0.01, 0.1, 1, 10, 100],  
#     'logistic__penalty': ['l2'],  
#     'logistic__solver': ['liblinear', 'saga']  
# }

# # Set up GridSearchCV
# grid_search_cv_2 = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search_cv_2.fit(X_train_new, y_train)

# # Get the best parameters and score
# lg_best_params = grid_search_cv_2.best_params_
# lg_best_score = grid_search_cv_2.best_score_
# print('Logistic Regression Best Parameter: {}'.format(lg_best_params))
# print('Logistic Regression Best Cross-Validation Score: {}'.format(lg_best_score))

In [None]:
# print('Logistic Regression Best Parameter: {}'.format(lg_best_params))
# print('Logistic Regression Best Cross-Validation Score: {}'.format(lg_best_score))

# # Logistic Regression Best Parameter: {'logistic__C': 100, 'logistic__penalty': 'l2', 'logistic__solver': 'saga'}
# # Logistic Regression Best Cross-Validation Score: 0.7994816585225347

In [None]:
# Logistic Regression pipeline: standardize the features, then fit a multinomial (softmax)
# logistic regression with the tuned C / penalty / solver.
# The 'saga' solver is stochastic, so random_state is fixed to make the fitted
# coefficients, validation metrics and test predictions reproducible across reruns.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases; it is kept
# here to preserve behavior on the installed version - confirm before upgrading.
pipe_2 = Pipeline(steps=[('scaler', StandardScaler()),
                       ('logreg', LogisticRegression(C = 100, penalty = 'l2', solver = 'saga', max_iter = 10000, 
                        multi_class = 'multinomial', dual = False, tol = 0.0001, random_state = 42))])

pipe_2.fit(X_train_new, y_train)

y_pred = pipe_2.predict(X_val_new)
print(y_pred[:5])

# Calculate the validation accuracy
accuracy = accuracy_score(y_val,y_pred)

# Generate the classification report
classification_report_lg = classification_report(y_val, y_pred)

# Generate the confusion matrix.
# The label set is the union of true and predicted classes and is passed explicitly, so the
# matrix rows/columns and the tick labels below always have the same length and order (a class
# predicted but absent from y_val would otherwise break the plot).
labels_lg = np.union1d(y_val, y_pred)
confusion_matrix_lg = confusion_matrix(y_val, y_pred, labels=labels_lg)

print('Validation Accuracy: {}'.format(accuracy))
print("\nClassification Report:\n", classification_report_lg)

# Predictions on the held-out test features
test_pred_lg = pipe_2.predict(test_data_new)

# Display the confusion matrix using ConfusionMatrixDisplay.
# Named cm_display_lg (not `display`) so IPython's built-in display() is not shadowed.
cm_display_lg = ConfusionMatrixDisplay(confusion_matrix_lg, display_labels=labels_lg)
cm_display_lg.plot(cmap='Blues')
plt.title('\nConfusion Matrix for Logistic Regression Model\n')
plt.show()

## *Insights*
## *Logistic regression (using the softmax function) is often preferred for its ability to directly model the probabilities of multiple classes.*

In [None]:
# print(pipe_2[-1].coef_.shape)
# print(pipe_2[-1].intercept_.shape)
# print(pipe_2[-1].classes_)

## Gaussian Naive Bayes model

In [None]:
# # Define the model
# naive_bayes_model = GaussianNB()

# param_grid = {
#     'var_smoothing': np.logspace(-9, -6, 10)  
# }

# # Set up GridSearchCV
# grid_search_cv_2 = GridSearchCV(naive_bayes_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1,n_jobs=-1)

# # Fit the model
# grid_search_cv_2.fit(X_train_new, y_train)

# # Get the best parameters and score
# nb_best_params = grid_search_cv_2.best_params_
# nb_best_score = grid_search_cv_2.best_score_
# print('Naive Bayes Best Parameter: {}'.format(nb_best_params))
# print('Naive Bayes Best Cross-Validation Score: {}'.format(nb_best_score))

In [None]:
# print('Naive Bayes Best Parameter: {}'.format(nb_best_params))
# print('Naive Bayes Best Cross-Validation Score: {}'.format(nb_best_score))

# Naive Bayes Best Parameter: {'var_smoothing': 1e-07}
# Naive Bayes Best Cross-Validation Score: 0.7492634824290336

In [None]:
# nb = GaussianNB(var_smoothing = 1e-07)
# nb.fit(X_train_new,y_train)

# y_pred = nb.predict(X_val_new)
# print(y_pred[:5])

# # Calculate the test accuracy
# accuracy = accuracy_score(y_val,y_pred)

# # Generate the classification report
# classification_report_nb = classification_report(y_val, y_pred)

# # Generate the confusion matrix
# confusion_matrix_nb = confusion_matrix(y_val, y_pred)

# print('Validation Accuracy: {}'.format(accuracy))
# print("\nClassification Report:\n", classification_report_nb)

# test_pred_nb = nb.predict(test_data_new)

# # Display the confusion matrix using ConfusionMatrixDisplay
# display = ConfusionMatrixDisplay(confusion_matrix_nb, display_labels=np.unique(y_val))
# display.plot(cmap='Blues')
# plt.title('\nConfusion Matrix for Naive Bayes Model\n')
# plt.show()

## Accuracy: Gives a general idea of each model's performance level. 
## Precision, recall, and F1-score are included in the classification report, which provides additional insights into the performance of the model, especially for datasets that are imbalanced.

## XGBClassifier and RandomForestClassifier models can provide feature importance scores and indicate which features are most influential in making predictions.

## To compare the three models (Random Forest, XGBoost, and Logistic Regression) after hyperparameter tuning, we will use the best cross-validation scores obtained for each model.

In [None]:
# rf_best_params = {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 500}
# rf_best_score = 0.9072303233761279

# xgb_best_params = {'subsample': 0.8, 'reg_lambda': 0.001, 'reg_alpha': 1, 'n_estimators': 70, 
#                    'min_child_weight': 8, 'max_depth': 12, 'learning_rate': 0.1, 'gamma': 0.01, 
#                    'colsample_bytree': 0.5}
# xgb_best_score = 0.9227174414906039

# lg_best_params = {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
# lg_best_score = 0.7994816585225347


In [None]:
# model_comparison = {
#     'Model': ['Random Forest', 'XGBoost', 'Logistic Regression'],
#     'Best Score': [rf_best_score, xgb_best_score, lg_best_score],
#     'Best Parameters': [rf_best_params, xgb_best_params, lg_best_params]
# }

# # Create the DataFrame 

# model_comparison_data_frame = pd.DataFrame(model_comparison)

# # Print the  model comparison DataFrame
# print("Model Comparison:")
# print(model_comparison_data_frame)

# # Determine the best model based on the best score
# best_model_index = model_comparison_data_frame['Best Score'].idxmax()    # idxmax() returns the index for the maximum value in each column.
# print("\nBest_model_index",best_model_index)
# best_model_name = model_comparison_data_frame['Model'][best_model_index]
# best_model_score = model_comparison_data_frame['Best Score'][best_model_index]


# print('\nBest Model: {}'.format(best_model_name))
# print('Best Score: {}'.format(best_model_score))
# print('Best Parameters: {}'.format(model_comparison_data_frame['Best Parameters'][best_model_index]))

## KNN Model

In [None]:
# knn_model = KNeighborsClassifier()

# # Define the parameter grid
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9, 11, 13, 15],  
#     'weights': ['uniform', 'distance'],      
#     'metric': ['euclidean', 'manhattan', 'minkowski'] 
# }

# grid_search_knn = GridSearchCV(knn_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# # Fit the GridSearchCV
# grid_search_knn.fit(X_train_new, y_train)

# # Get the best parameters and best score
# best_params_knn = grid_search_knn.best_params_
# best_score_knn = grid_search_knn.best_score_

# print('Best Parameters for KNN: {}'.format(best_params_knn))
# print('Best Cross-Validation Accuracy for KNN: {}'.format(best_score_knn))

In [None]:
# print('Best Parameters for KNN: {}'.format(best_params_knn))
# print('Best Cross-Validation Accuracy for KNN: {}'.format(best_score_knn))

# Best Parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
# Best Cross-Validation Accuracy for KNN: 0.7776479066041542

In [None]:
# knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = 9, weights = 'distance')
# knn.fit(X_train_new,y_train)

# y_pred = knn.predict(X_val_new)
# print(y_pred[:5])

# # Calculate the test accuracy
# accuracy = accuracy_score(y_val,y_pred)

# # Generate the classification report
# classification_report_knn = classification_report(y_val, y_pred)

# # Generate the confusion matrix
# confusion_matrix_knn = confusion_matrix(y_val, y_pred)

# print('Validation Accuracy: {}'.format(accuracy))
# print("\nClassification Report:\n", classification_report_knn)

# test_pred_knn = knn.predict(test_data_new)

# # Display the confusion matrix using ConfusionMatrixDisplay
# display = ConfusionMatrixDisplay(confusion_matrix_knn, display_labels=np.unique(y_val))
# display.plot(cmap='Blues')
# plt.title('\nConfusion Matrix for KNN Model\n')
# plt.show()

## SVM Model

In [None]:
# svm_model = SVC(random_state=42)

# # Define the parameter grid 
# param_grid = {
#     'C': [0.1, 1, 10, 100],  
#     'gamma': [1, 0.1, 0.01, 0.001], 
#     'kernel': ['rbf', 'poly']  
# }

# grid_search_svm = GridSearchCV(svm_model, param_grid, refit = True, verbose = 3)

# # Fit the GridSearchCV
# grid_search_svm.fit(X_train_new, y_train)

# # Get the best parameters and best score
# best_params_svm = grid_search_svm.best_params_
# best_score_svm = grid_search_svm.best_score_

# print('Best Parameters for SVM: {}'.format(best_params_svm))
# print('Best Cross-Validation Accuracy for SVM: {}'.format(best_score_svm))

In [None]:
# print('Best Parameters for SVM: {}'.format(best_params_svm))
# print('Best Cross-Validation Accuracy for SVM: {}'.format(best_score_svm))

# kernel="poly",random_state=42,gamma=1, C=1.0

In [None]:
# svm = SVC()
# svm.fit(X_train_new,y_train)

# y_pred = svm.predict(X_val_new)
# print(y_pred[:5])

# # Calculate the test accuracy
# accuracy = accuracy_score(y_val,y_pred)

# # Generate the classification report
# classification_report_svm = classification_report(y_val, y_pred)

# # Generate the confusion matrix
# confusion_matrix_svm = confusion_matrix(y_val, y_pred)

# print('Validation Accuracy: {}'.format(accuracy))
# print("\nClassification Report:\n", classification_report_svm)

# test_pred_svm = svm.predict(test_data_new)

# # Display the confusion matrix using ConfusionMatrixDisplay
# display = ConfusionMatrixDisplay(confusion_matrix_svm, display_labels=np.unique(y_val))
# display.plot(cmap='Blues')
# plt.title('\nConfusion Matrix for SVM Model\n')
# plt.show()

## MLP Model (Multi-Layer Perceptron) 

In [None]:
# mlp_model = MLPClassifier(max_iter=10000, random_state=42)

# # Define the parameter grid 
# param_grid = {
#     'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100, 50)],  
#     'activation': ['tanh', 'relu'],  
#     'solver': ['adam', 'sgd'], 
#     'alpha': [0.0001, 0.001, 0.01],  
#     'learning_rate': ['constant', 'adaptive']  
# }

# # Set up GridSearchCV
# grid_search_mlp = GridSearchCV(mlp_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# # Fit the GridSearchCV
# grid_search_mlp.fit(X_train_new, y_train)

# # Get the best parameters and best score
# best_params_mlp = grid_search_mlp.best_params_
# best_score_mlp = grid_search_mlp.best_score_

# print('Best Parameters for MLP: {}'.format(best_params_mlp))
# print('Best Cross-Validation Accuracy for MLP: {}'.format(best_score_mlp))

In [None]:
# print('Best Parameters for MLP: {}'.format(best_params_mlp))
# print('Best Cross-Validation Accuracy for MLP: {}'.format(best_score_mlp))

# Best Parameters for MLP: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100, 100, 50), 'learning_rate': 'constant', 'solver': 'adam'}
# Best Cross-Validation Accuracy for MLP: 0.7642922779600693

In [None]:
# mlp = MLPClassifier(activation = 'logistic', alpha = 0.001, hidden_layer_sizes= (100, 100, 50), learning_rate = 'constant', solver = 'adam')
# mlp.fit(X_train_new,y_train)

# y_pred = mlp.predict(X_val_new)
# print(y_pred[:5])

# # Calculate the test accuracy
# accuracy = accuracy_score(y_val,y_pred)

# # Generate the classification report
# classification_report_mlp = classification_report(y_val, y_pred)

# # Generate the confusion matrix
# confusion_matrix_mlp = confusion_matrix(y_val, y_pred)

# print('Validation Accuracy: {}'.format(accuracy))
# print("\nClassification Report:\n", classification_report_mlp)

# test_pred_mlp = mlp.predict(test_data_new)

# # Display the confusion matrix using ConfusionMatrixDisplay
# display = ConfusionMatrixDisplay(confusion_matrix_mlp, display_labels=np.unique(y_val))
# display.plot(cmap='Blues')
# plt.title('\nConfusion Matrix for MLP Model\n')
# plt.show()

## Function to Plot Learning Curves

In [None]:
# def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 5)):
#     plt.figure()
#     plt.title(title)
#     plt.xlabel("Training examples")
#     plt.ylabel("Score")
    
#     train_sizes, train_scores, test_scores = learning_curve(
#         estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='accuracy'
#     )
    
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)
    
#     plt.grid()
    
#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                      train_scores_mean + train_scores_std, alpha=0.1,
#                      color="r")
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                      test_scores_mean + test_scores_std, alpha=0.1, color="g")
#     plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
#              label="Training score")
#     plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
#              label="Cross-validation score")
    
#     plt.legend(loc="best")
#     return plt

## Plotting Learning Curves for Each Model
##  1. Random Forest Classifier

In [None]:
# plot_learning_curve(forest, "Learning Curves (Random Forest)", X_train_new, y_train)
# plt.show()

## 2. XGBoost Classifier

In [None]:
# plot_learning_curve(pipe_1, "Learning Curves (XGBoost)", X_train_new, y_train)
# plt.show()

## 3. Logistic Regression:

In [None]:
# plot_learning_curve(pipe_2, "Learning Curves (Logistic Regression)", X_train_new, y_train)
# plt.show()

In [None]:
# best_model=classifier

In [None]:
# test_pred=best_model.predict(test_data)

In [None]:
# Convert the XGBoost pipeline's test predictions (test_pred_xgb_1) back to their original
# labels via label_encoder_crime.inverse_transform, then show the decoded array.
# NOTE(review): presumably label_encoder_crime is the encoder fitted on the target column - confirm.
test_pred_decode = label_encoder_crime.inverse_transform(test_pred_xgb_1)
test_pred_decode



In [None]:
# Load sample.csv from the competition input directory.
# NOTE(review): presumably this is the sample submission used as a format reference - confirm.
sample=pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/sample.csv")

In [None]:
# Show the (rows, columns) dimensions of the sample frame.
sample.shape

In [None]:
# Preview the first rows of the sample frame to see its column layout.
sample.head()

In [None]:
# Show the (rows, columns) dimensions of the test data, for comparison with the sample frame above.
test_data.shape

In [None]:
# Build the submission frame: a sequential 1-based ID paired with each decoded prediction.
# The ID range is derived from the number of predictions instead of a hard-coded 5000, so the
# two columns always have equal length (a mismatch would make the DataFrame constructor raise).
submission=pd.DataFrame({"ID": np.arange(1, len(test_pred_decode) + 1),
                         "Crime_Category": test_pred_decode})
print(submission.head())
# Write the submission without the DataFrame index so only ID and Crime_Category are saved.
submission.to_csv('submission.csv',index=False)