# Steps in ML projects

1. Look at the big picture.
2. Get the data.
3. Discover and visualize the data to gain insights.
4. Prepare the data for Machine Learning algorithms.
5. Select a model and train it.
6. Fine-tune your model.
7. Present your solution.
8. Launch, monitor and maintain your system.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split

# 2. Data Loading

In [None]:
# Read the competition training set, then separate the target column from the predictors.
# NOTE(review): X and y are taken here, before the cleaning applied to train_df in the
# EDA cells below, so those later edits to train_df do not appear to reach X - confirm intent.
train_df = pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/train.csv")
y = train_df['Crime_Category']
X = train_df.drop(columns=['Crime_Category'])

# 3. EDA

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
# Summary statistics as produced by DataFrame.describe().
numeric_summary = train_df.describe()
print(numeric_summary)

# How often each distinct value occurs in the target and in Victim_Sex.
for categorical_col in ('Crime_Category', 'Victim_Sex'):
    print(train_df[categorical_col].value_counts())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count of missing entries per column, then the number of fully duplicated rows.
missing_per_column = train_df.isnull().sum()
print(missing_per_column)
duplicate_rows = train_df.duplicated().sum()
print(duplicate_rows)

# One box plot per column to eyeball potential outliers.
for boxplot_col in ('Victim_Age', 'Time_Occurred'):
    sns.boxplot(x=train_df[boxplot_col])
    plt.show()


**Handling the negative and zero age values**

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# Expects train_df to be loaded already.

# Mark zero and negative ages as missing; values that are already NaN stay NaN.
age_is_positive = train_df['Victim_Age'] > 0
train_df['Victim_Age'] = train_df['Victim_Age'].where(age_is_positive, np.nan)

# Fill the missing ages with a 5-neighbour KNN imputer fitted on the age column alone.
# NOTE(review): with a single input column there is no other feature to measure
# neighbour distance on, so this looks equivalent to mean imputation - confirm.
knn_imputer = KNNImputer(n_neighbors=5)
imputed_age = knn_imputer.fit_transform(train_df[['Victim_Age']])
train_df[['Victim_Age']] = imputed_age

# Inspect the column after imputation.
print(train_df['Victim_Age'].describe())

In [None]:
# Flag rows where Victim_Age or Time_Occurred lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
iqr_frame = train_df[['Victim_Age', 'Time_Occurred']]
Q1 = iqr_frame.quantile(0.25)
Q3 = iqr_frame.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Cell-wise comparison against the per-column fences, then collapse to one flag per row.
too_low = iqr_frame < lower_fence
too_high = iqr_frame > upper_fence
outliers = (too_low | too_high).any(axis=1)
outliers_indices = train_df.index[outliers]
print(train_df.loc[outliers_indices])

In [None]:
# Replace IQR outliers with the column median, one column at a time.
# A row-level "either column is an outlier" mask would also overwrite the valid
# value in the other column of that row, so each column gets its own mask here.
median_age = train_df['Victim_Age'].median()
median_time = train_df['Time_Occurred'].median()

for outlier_col, replacement in (('Victim_Age', median_age), ('Time_Occurred', median_time)):
    col_q1 = train_df[outlier_col].quantile(0.25)
    col_q3 = train_df[outlier_col].quantile(0.75)
    col_iqr = col_q3 - col_q1
    # Explicit < / > comparisons so that NaN entries are never flagged as outliers.
    col_is_outlier = (
        (train_df[outlier_col] < col_q1 - 1.5 * col_iqr)
        | (train_df[outlier_col] > col_q3 + 1.5 * col_iqr)
    )
    train_df.loc[col_is_outlier, outlier_col] = replacement

In [None]:
# Summary statistics after imputation
print(train_df['Victim_Age'].describe())

In [None]:
# Parse both date columns from their "month/day/year hour:minute:second AM/PM" text form.
date_format = '%m/%d/%Y %I:%M:%S %p'
for date_col in ('Date_Reported', 'Date_Occurred'):
    train_df[date_col] = pd.to_datetime(train_df[date_col], format=date_format)

# Histograms of the reported dates and of the Time_Occurred column.
for hist_col, n_bins, hist_title, hist_xlabel in (
    ('Date_Reported', 30, 'Distribution of Reported Dates', 'Date'),
    ('Time_Occurred', 24, 'Distribution of Time Occurred', 'Time'),
):
    train_df[hist_col].hist(bins=n_bins)
    plt.title(hist_title)
    plt.xlabel(hist_xlabel)
    plt.ylabel('Frequency')
    plt.show()

# Calendar features derived from the occurrence date.
occurred_on = train_df['Date_Occurred']
train_df['Month'] = occurred_on.dt.month
train_df['Day_of_Week'] = occurred_on.dt.day_name()

# Row counts per month and per weekday, drawn as bar charts.
for group_col, bar_title, bar_xlabel in (
    ('Month', 'Crimes by Month', 'Month'),
    ('Day_of_Week', 'Crimes by Day of Week', 'Day of Week'),
):
    rows_per_group = train_df.groupby(group_col).size()
    rows_per_group.plot(kind='bar')
    plt.title(bar_title)
    plt.xlabel(bar_xlabel)
    plt.ylabel('Number of Crimes')
    plt.show()

In [None]:
# Bar chart of the number of rows in each crime category.
sns.countplot(data=train_df, x='Crime_Category')
plt.title('Distribution of Crime Categories')
plt.xlabel('Crime Category')
plt.ylabel('Frequency')
plt.xticks(rotation=90)  # rotate the tick labels so they do not overlap
plt.show()

# Contingency table: crime category (rows) against victim sex (columns).
cross_tab = pd.crosstab(index=train_df['Crime_Category'], columns=train_df['Victim_Sex'])
print(cross_tab)

In [None]:
# Ten most frequent values of the modus operandi and weapon description columns.
for top_col in ('Modus_Operandi', 'Weapon_Description'):
    top_ten = train_df[top_col].value_counts().head(10)
    print(top_ten)

In [None]:
# Number of rows per area, drawn as a bar chart.
area_distribution = train_df['Area_Name'].value_counts()
area_distribution.plot.bar()
plt.title('Crime Distribution by Area')
plt.xlabel('Area Name')
plt.ylabel('Number of Crimes')
plt.xticks(rotation=90)
plt.show()

# Ten most frequent cross streets.
top_cross_streets = train_df['Cross_Street'].value_counts().head(10)
print(top_cross_streets)

In [None]:
# Histogram of victim ages.
train_df['Victim_Age'].hist(bins=30)
plt.title('Distribution of Victim Ages')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# Count plots for Victim_Sex and Victim_Descent; only the descent plot rotates its tick labels.
for count_col, count_title, count_xlabel, tick_rotation in (
    ('Victim_Sex', 'Victim Gender Distribution', 'Gender', None),
    ('Victim_Descent', 'Victim Descent Distribution', 'Descent', 90),
):
    sns.countplot(x=count_col, data=train_df)
    plt.title(count_title)
    plt.xlabel(count_xlabel)
    plt.ylabel('Frequency')
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()

In [None]:
# Pairwise correlations between the selected columns, shown as an annotated heatmap.
corr_columns = ['Victim_Age', 'Time_Occurred', 'Area_ID', 'Reporting_District_no', 'Part 1-2']
correlation_matrix = train_df[corr_columns].corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

# Spread of victim age within each crime category.
sns.boxplot(data=train_df, x='Crime_Category', y='Victim_Age')
plt.title('Victim Age by Crime Category')
plt.xticks(rotation=90)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
sns.set()
train_df.hist(bins=50,figsize=(15,15))
# display histogram
plt.show()

In [None]:
y.head()

In [None]:
X.head()

In [None]:
X.info()

In [None]:
X.columns

In [None]:
X.nunique()

In [None]:
X['Victim_Sex'].unique()

In [None]:
X['Status'].unique()

In [None]:
y.unique()

In [None]:
X.isna().sum()

#To find out the missing values in X

In [None]:
X.info()

In [None]:
# Hold out 33% of the rows as a validation set; the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
#X_train=X_train.drop(['Area_Name','Part 1-2', 'Premise_Description', 'Weapon_Description', 'Status_Description'], axis=1)

In [None]:
#X_val=X_val.drop(['Area_Name','Part 1-2', 'Premise_Description', 'Weapon_Description', 'Status_Description'], axis=1)

# 4. Preparing the Data for ML Algorithm

In [None]:
# Approach 1: ordinal-encode every column -> impute missing values with the median -> standardise.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

transform_pipeline = Pipeline([
    # Categories not seen during fit are encoded as NaN instead of raising, so the
    # fitted pipeline can be applied to validation/test data with .transform();
    # the median imputer in the next step then fills those NaNs.
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Fit on the training features only, then wrap the resulting array in a DataFrame.
# All three steps keep the column order, so the original column names still apply.
transformed_df = transform_pipeline.fit_transform(X_train)
transformed_df = pd.DataFrame(transformed_df, columns=X_train.columns)

In [None]:
transformed_df.info()

In [None]:
transformed_df.head()

In [None]:
#Approach 2: Impute constants - Encode numbers - Scale the dataframe
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer # Not used in this version
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
import numpy as np

# Per-column imputers: a constant "Unknown" for three categorical columns and the
# most frequent value for the weapon code. All other columns pass through unchanged.
transformer_list = [
    ('cat_const1', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Unknown"), ['Cross_Street']),
    ('cat_const3', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Unknown"), ['Victim_Sex']),
    ('cat_const4', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value="Unknown"), ['Victim_Descent']),

    ('cat_mode4', SimpleImputer(missing_values=np.nan, strategy='most_frequent'), ['Weapon_Used_Code']),

]

transform_pipeline2 = Pipeline([
    ('ct', ColumnTransformer(
        transformers=transformer_list,
        remainder='passthrough',
        verbose_feature_names_out=False,
    ).set_output(transform="pandas")),
    ('encoder2', OrdinalEncoder()),
    ('std_scaler2', StandardScaler()),
])

# Transform all columns of the training features.
transformed_df2 = transform_pipeline2.fit_transform(X_train)

# The ColumnTransformer emits the explicitly listed columns first and the passthrough
# columns after them, so the output order differs from X_train.columns. Label the
# array with the transformer's own output names (the encoder and scaler keep that
# order) rather than X_train.columns, which would attach the wrong names.
ct_feature_names = transform_pipeline2.named_steps['ct'].get_feature_names_out()
transformed_df2 = pd.DataFrame(transformed_df2, columns=ct_feature_names)

# transformed_df2 has the same number of columns as X_train, in the ColumnTransformer's output order.
transformed_df2.head()

In [None]:
transformed_df2.info()

All the features have been handled up to this point. Next: the labels.

In [None]:
y.info()

In [None]:
#Not required
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded=label_encoder.fit_transform(y_train)
print(type(y_train_encoded))

In [None]:
#print(type(y_tr))

In [None]:
#print(y_tr.shape)

In [None]:
#y_tr = pd.DataFrame(y_tr, columns=['Label'])
#print(type(y_tr))

In [None]:
#y_tr.info()

# 5. Selecting and Training a ML Model

## 1.2 Selection of performance measure

* Regression
  *  Mean Squared Error (MSE) or
  *  Mean Absolute Error (MAE)
* Classification
  *  Precision
  *  Recall
  *  F1-score
  *  Accuracy

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
model = DummyClassifier(strategy = "most_frequent")
model.fit(transformed_df,y_train)

In [None]:
model2 = DecisionTreeClassifier()
model2.fit(transformed_df,y_train)

In [None]:
model3 = BaggingClassifier()
model3.fit(transformed_df, y_train) #0.83

In [None]:
model4 = AdaBoostClassifier()
model4.fit(transformed_df, y_train)

In [None]:
model5 = GradientBoostingClassifier()
model5.fit(transformed_df, y_train) #0.85

In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors = 3) 
clf.fit(transformed_df, y_train) 

In [None]:
from sklearn.svm import SVC 

svm = SVC(kernel="rbf", gamma=0.5, C=1.0) 
svm.fit(transformed_df, y_train)

In [None]:
#model6 = MLPClassifier()
#model6.fit(transformed_df, y_train) Not converging

In [None]:
# model.predict(X_val.iloc[0:1])

In [None]:
#X_val.iloc[0:1]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score


def display_scores(scores):
    """Print the individual cross-validation scores, their mean and their standard deviation.

    Args:
        scores: array of per-fold scores, as returned by ``cross_val_score``.
    """
    print("CV Scores:", scores)
    print("Mean CV Score:", scores.mean())
    print("CV Scores Standard deviation:", scores.std())


# The validation set must be transformed with the statistics learned from the
# TRAINING set. Calling fit_transform on X_val would re-learn the category codes,
# medians and scaling from the validation data, so the features would no longer
# mean what they meant when model5 was trained.
# Unseen categories are encoded as NaN (then median-imputed) instead of raising.
# Refitting on X_train reproduces the encoding the models were trained on, because
# the learned categories depend only on the training data.
transform_pipeline.set_params(
    encoder__handle_unknown='use_encoded_value',
    encoder__unknown_value=np.nan,
)
transform_pipeline.fit(X_train)
X_val_transformed = transform_pipeline.transform(X_val)  # Transforming the validation set
X_val_transformed = pd.DataFrame(X_val_transformed, columns=X_train.columns)  # Converting array into dataframe

y_pred_val = model5.predict(X_val_transformed)  # Predicting the validation set
print("Accuracy score:", accuracy_score(y_val, y_pred_val))
print("Precision Score:", precision_score(y_val, y_pred_val, average="weighted"))
print("Recall Score:", recall_score(y_val, y_pred_val, average="weighted"))
print('F1 Score:', f1_score(y_val, y_pred_val, average="weighted"))
print()

# Cross-validate against the TRUE labels. Passing the model's own predictions
# (y_pred_val) as the target only measures how well the model can reproduce
# itself and yields an over-optimistic score.
score = cross_val_score(model5, X_val_transformed, y_val, cv=5)
display_scores(score)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred_val))

# 6. Fine-tuning the ML Model

# of SVM

In [None]:
'''from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
#0.1, 1, 10, 100, 1000
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100], 
            'gamma': [0.1, 0.01, 0.001, 0.0001], 
            'kernel': ['rbf']} 

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 

# fitting the model for grid search 
grid.fit(transformed_df, y_train)'''

'''Best combination: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
SVC(C=100, gamma=0.01) 

Accuracy score: 0.703030303030303
Precision Score: 0.6877296651877802
Recall Score: 0.703030303030303
F1 Score: 0.6676796862625628

Mean CV Score: 0.9318181818181817
CV Scores Standard deviation: 0.005525631069336515
'''

In [None]:
'''# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_)'''

In [None]:
#print(grid.cv_results_)

In [None]:
'''y_pred_val = grid.best_estimator_.predict(X_val_transformed) #Predicting the validation set
print("Accuracy score:", accuracy_score(y_val, y_pred_val)) 
print("Precision Score:",precision_score(y_val, y_pred_val, average="weighted"))
print("Recall Score:",recall_score(y_val, y_pred_val, average="weighted"))
print('F1 Score:', f1_score(y_val, y_pred_val, average="weighted"))
print()

from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("CV Scores:", scores)
    print("Mean CV Score:", scores.mean())
    print("CV Scores Standard deviation:", scores.std())


score = cross_val_score (grid.best_estimator_, X_val_transformed, y_pred_val, cv=5)
display_scores(score)'''

In [None]:
'''from sklearn.model_selection import RandomizedSearchCV

#feature_importances = grid.best_estimator_.feature_importances_'''

# of KNN

In [None]:
'''from sklearn.neighbors import KNeighborsClassifier

K = [] 
training = [] 
test = [] 
scores = {}

for k in range(2, 20): 
    clf = KNeighborsClassifier(n_neighbors = k) 
    clf.fit(transformed_df, y_train) 
  
    training_score = clf.score(transformed_df, y_train) 
    test_score = clf.score(X_val_transformed, y_val) 
    K.append(k) 
  
    training.append(training_score) 
    test.append(test_score) 
    scores[k] = [training_score, test_score] 

for keys, values in scores.items(): 
    print(keys, ':', values)
    
plt.scatter(K, training, color ='k') 
plt.scatter(K, test, color ='g') 
plt.show()'''

In [None]:
# NOTE: the loop above (when enabled) leaves `clf` fitted with the last value of k it tried, not the best one.
# TODO: write the code that selects the best parameters for KNN.

In [None]:
'''y_pred_val = clf.predict(X_val_transformed) #Predicting the validation set
print("Accuracy score:", accuracy_score(y_val, y_pred_val)) 
print("Precision Score:",precision_score(y_val, y_pred_val, average="weighted"))
print("Recall Score:",recall_score(y_val, y_pred_val, average="weighted"))
print('F1 Score:', f1_score(y_val, y_pred_val, average="weighted"))
print()

from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("CV Scores:", scores)
    print("Mean CV Score:", scores.mean())
    print("CV Scores Standard deviation:", scores.std())


score = cross_val_score (clf, X_val_transformed, y_pred_val, cv=5)
display_scores(score)'''

'''Results: Converging graph. Latest K value is the best value
Accuracy score: 0.6686363636363636
Precision Score: 0.6158021426725532
Recall Score: 0.6686363636363636
F1 Score: 0.6207667871933233

Mean CV Score: 0.8877272727272729
CV Scores Standard deviation: 0.005124707431905378'''

# of GB Algo

In [None]:
'''
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
	'n_estimators': [300, 100, 50],
	'learning_rate': [0.1, 0.3, 0.01, 0.2],
}

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=2, scoring='accuracy', n_jobs=-1)

# Fit the model to the training data using GridSearchCV
grid_search.fit(transformed_df, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set using the best model
y_pred_best = best_model.predict(X_val_transformed)

# Evaluate the best model
accuracy_best = accuracy_score(y_val, y_pred_best)

# Print the results
print("Best Parameters:", best_params)
print(f"Best Model Accuracy: {accuracy_best}")'''

'''
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 300}
Best Model Accuracy: 0.7625757575757576


Accuracy score: 0.7625757575757576
Precision Score: 0.7519448838237669
Recall Score: 0.7625757575757576
F1 Score: 0.7459042305297008

CV Scores: [0.92727273 0.93409091 0.91818182 0.92727273 0.91893939]
Mean CV Score: 0.9251515151515151
CV Scores Standard deviation: 0.005934289705323058'''


In [None]:
'''y_pred_val = grid_search.best_estimator_.predict(X_val_transformed) #Predicting the validation set
print("Accuracy score:", accuracy_score(y_val, y_pred_val)) 
print("Precision Score:",precision_score(y_val, y_pred_val, average="weighted"))
print("Recall Score:",recall_score(y_val, y_pred_val, average="weighted"))
print('F1 Score:', f1_score(y_val, y_pred_val, average="weighted"))
print()

from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("CV Scores:", scores)
    print("Mean CV Score:", scores.mean())
    print("CV Scores Standard deviation:", scores.std())


score = cross_val_score (grid_search.best_estimator_, X_val_transformed, y_pred_val, cv=5)
display_scores(score)'''

# 7. Present your solution

In [None]:
X_test= pd.read_csv("/kaggle/input/crime-cast-forecasting-crime-categories/test.csv")

In [None]:
X_test.shape

In [None]:
#X_test=X_test.drop(['Area_Name','Part 1-2', 'Premise_Description', 'Weapon_Description', 'Status_Description'], axis=1)
# Apply the preprocessing learned from the TRAINING features to the test set.
# Calling fit_transform on X_test would re-learn the category codes, medians and
# scaling from the test data, so the features fed to model5 would not match the
# ones it was trained on. Categories unseen in training are encoded as NaN and
# then median-imputed. The pipeline is refitted on X_train here so this cell does
# not depend on which data it was last fitted on.
transform_pipeline.set_params(
    encoder__handle_unknown='use_encoded_value',
    encoder__unknown_value=np.nan,
)
transform_pipeline.fit(X_train)
X_test_transformed = transform_pipeline.transform(X_test)  # Transforming the test set
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X_train.columns)  # Converting array into dataframe
y_pred = model5.predict(X_test_transformed)  # Predicting the test set

In [None]:
#y_test_predictions = grid_search.best_estimator_.predict(X_test_transformed)

In [None]:
# Build the submission file: one row per prediction, with IDs numbered from 1.
# The ID range is derived from the number of predictions instead of being
# hard-coded to 5000, so the cell keeps working if the test set size changes.
submission = pd.DataFrame({
    "ID": np.arange(1, len(y_pred) + 1),
    "Crime_Category": y_pred,
})

submission.to_csv('submission.csv', index=False)