<a href="https://colab.research.google.com/github/Collins-nnaji/Data_Science/blob/main/Project_1_ShippingData_DecisionTree_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
# Location of the shipping CSV. The path points into /content/drive, so this
# presumably needs Google Drive to be mounted first (no mount call is visible
# in this section) -- TODO confirm.
path = "/content/drive/MyDrive/Colab Notebooks/Shipping_data.csv"

# Parse the CSV into the DataFrame that the preprocessing cells below work on.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [None]:
# Convert the categorical variables into integer codes using label encoding.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# code -> label mapping stays available afterwards (via `.classes_` or
# `.inverse_transform(...)`) instead of being overwritten by the next fit.
# NOTE(review): LabelEncoder numbers classes in sorted order, so
# Product_importance is coded alphabetically (low=1, medium=2 in the preview
# output) rather than by rank -- consider an explicit ordinal mapping if the
# ordering matters for the linear/SVM models.
categorical_cols = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender']
label_encoders = {}
for col in categorical_cols:
    # After the loop `le` is bound to the last (Gender) encoder, as before.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Standardize the numeric feature columns with StandardScaler.
# NOTE(review): the scaler is fit on the full DataFrame, i.e. before the
# train/test split further down, so test rows contribute to the fitted
# mean/std.
numeric_cols = [
    'Customer_care_calls',
    'Customer_rating',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms',
]
sc = StandardScaler()
# Overwrite the selected columns in place with their scaled values.
df[numeric_cols] = sc.fit_transform(df[numeric_cols])


In [None]:
# Preview the first five rows again, now with encoded and standardized columns.
df.head(n=5)

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,3,0,-0.047711,-0.700755,-0.690722,-0.372735,1,0,1.889983,-1.46824,1
1,2,4,0,-0.047711,1.421578,0.120746,-1.029424,1,1,2.815636,-0.333893,1
2,3,0,0,-1.799887,-0.700755,-0.565881,0.283954,1,1,2.136824,-0.159002,1
3,4,1,0,-0.923799,0.006689,-0.711529,0.283954,2,1,-0.208162,-1.502484,1
4,5,2,0,-1.799887,-0.700755,-0.545074,-0.372735,2,0,2.013404,-0.703244,1


In [None]:
# Split the dataset into training and testing sets.
# 'ID' is dropped together with the target: it appears to be a row identifier
# (1, 2, 3, ... in the preview) rather than a shipment attribute, and it is
# left at its raw magnitude while the numeric features are standardized,
# which skews scale-sensitive models such as LogisticRegression and SVC.
X = df.drop(columns=['ID', 'Reached.on.Time_Y.N'])
y = df['Reached.on.Time_Y.N']
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Build the decision tree model.
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits can otherwise resolve differently
# from run to run and change the tuned result.
dt = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning using 10-fold cross-validation over both split
# criteria and tree depths 1 through 9.
params = {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 10)}
dt_cv = GridSearchCV(dt, params, cv=10)
dt_cv.fit(X_train, y_train)
print('Best hyperparameters for Decision Tree:', dt_cv.best_params_)

# Evaluate the model on the test set (with the default refit=True,
# GridSearchCV predicts with the best estimator refit on all training rows).
y_pred = dt_cv.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Best hyperparameters for Decision Tree: {'criterion': 'gini', 'max_depth': 1}
Accuracy: 0.6945454545454546
              precision    recall  f1-score   support

           0       0.57      1.00      0.73       895
           1       1.00      0.49      0.65      1305

    accuracy                           0.69      2200
   macro avg       0.79      0.74      0.69      2200
weighted avg       0.83      0.69      0.68      2200

[[895   0]
 [672 633]]


In [None]:
# Build the random forest model.
# The forest is stochastic (bootstrap samples and random feature subsets), so
# the seed is fixed to make the selected hyperparameters and the reported
# scores reproducible across runs.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using 10-fold cross-validation over forest size and
# tree depths 1 through 9. That is 36 candidates x 10 folds = 360 fits, so
# n_jobs=-1 spreads them over all available cores; with the seed fixed the
# results do not depend on the degree of parallelism.
params = {'n_estimators': [10, 50, 100, 200], 'max_depth': range(1, 10)}
rf_cv = GridSearchCV(rf, params, cv=10, n_jobs=-1)
rf_cv.fit(X_train, y_train)
print('Best hyperparameters for Random Forest:', rf_cv.best_params_)

# Evaluate the model on the test set (with the default refit=True,
# GridSearchCV predicts with the best estimator refit on all training rows).
y_pred = rf_cv.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


Best hyperparameters for Random Forest: {'max_depth': 3, 'n_estimators': 100}
Accuracy: 0.6913636363636364
              precision    recall  f1-score   support

           0       0.57      0.98      0.72       895
           1       0.97      0.49      0.65      1305

    accuracy                           0.69      2200
   macro avg       0.77      0.74      0.69      2200
weighted avg       0.81      0.69      0.68      2200

[[877  18]
 [661 644]]


In [None]:
# Build the logistic regression model.
# With the default iteration budget (max_iter=100) the recorded run stopped
# with "TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the solver had not
# converged, so it is given a larger budget here.
# NOTE(review): if the warning persists, check for feature columns that are
# still on a much larger scale than the standardized ones.
lr = LogisticRegression(max_iter=1000)

# Fit the model to the training data
lr.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lr.predict(X_test)

# Evaluate the performance of the model
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.649090909090909
              precision    recall  f1-score   support

           0       0.56      0.65      0.60       895
           1       0.73      0.65      0.69      1305

    accuracy                           0.65      2200
   macro avg       0.64      0.65      0.64      2200
weighted avg       0.66      0.65      0.65      2200

[[584 311]
 [461 844]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import numpy as np



# Support-vector classifier with scikit-learn's default settings, trained on
# the training split (fit() returns the fitted estimator itself).
svm = SVC().fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = svm.predict(X_test)

# Report accuracy first, then the per-class metrics and the confusion matrix.
svm_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', svm_accuracy)
for summary in (classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)):
    print(summary)

Accuracy: 0.6863636363636364
              precision    recall  f1-score   support

           0       0.57      0.91      0.70       895
           1       0.90      0.53      0.67      1305

    accuracy                           0.69      2200
   macro avg       0.73      0.72      0.69      2200
weighted avg       0.77      0.69      0.68      2200

[[816  79]
 [611 694]]
