In [1]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [2]:
link = 'https://drive.google.com/file/d/1VWHKlhNASwSabG_MN9O0itOZ9ZaXUA4e/view?usp=sharing'

import pandas as pd

# to get the id part of the file
id = link.split("/")[-2]

downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('internship_dataset.csv')

df = pd.read_csv('internship_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,national_inv,lead_time,pieces_past_due,perf_6_month_avg,in_transit_qty,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,0,0.0,7.87,0.0,-99.0,0.0,0.0,0,0,0,1,0,0
1,1,2.0,9.0,0.0,0.99,0.0,0.0,0,0,0,1,0,0
2,2,2.0,7.87,0.0,-99.0,0.0,0.0,1,0,0,1,0,0
3,3,7.0,8.0,0.0,0.1,0.0,0.0,0,0,0,1,0,0
4,4,8.0,7.87,0.0,-99.0,0.0,0.0,1,0,0,1,0,0


In [3]:
df['went_on_backorder'].value_counts()

0    1676567
1      11293
Name: went_on_backorder, dtype: int64

In [4]:
#as we can clearly see there is imbalance of data in out testing set
#we have to balance this data
#we will do oversampeling which means increasing the number of instances in the minority class

In [5]:
from imblearn.over_sampling import SMOTE

# Separate features and target variable
X = df.drop("went_on_backorder", axis=1)
y = df["went_on_backorder"]
smote = SMOTE(random_state=42)

X_resampled, y_resampled = smote.fit_resample(X, y)

print("Class distribution after oversampling:")
print(y_resampled.value_counts())


Class distribution after oversampling:
0    1676567
1    1676567
Name: went_on_backorder, dtype: int64


In [6]:
X_resampled.head(1)

Unnamed: 0.1,Unnamed: 0,national_inv,lead_time,pieces_past_due,perf_6_month_avg,in_transit_qty,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop
0,0,0.0,7.87,0.0,-99.0,0.0,0.0,0,0,0,1,0


In [10]:
from sklearn.model_selection import train_test_split
x_tr, x_te, y_tr, y_te = train_test_split(X_resampled,y_resampled, random_state=0, test_size=0.2)

In [11]:
x_tr

Unnamed: 0.1,Unnamed: 0,national_inv,lead_time,pieces_past_due,perf_6_month_avg,in_transit_qty,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop
647163,647163,8.000000,9.000000,0.0,0.990000,0.000000,0.000000,0,0,0,1,0
1706954,392174,6.748368,2.000000,0.0,0.874569,0.000000,0.000000,0,0,0,1,0
3269289,757424,-22.112737,8.318973,0.0,0.825170,0.000000,25.517870,0,0,0,1,0
2442862,287368,0.031824,7.995863,0.0,-2.869837,0.000000,1.936351,0,0,0,0,0
2676643,950303,0.918766,10.918766,0.0,0.486204,0.000000,0.540617,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2249467,1617643,0.287746,6.985778,0.0,0.771936,0.000000,0.000000,0,0,0,1,0
963395,963395,12.000000,8.000000,0.0,0.820000,0.000000,0.000000,0,0,0,1,0
2215104,1111367,8.844778,8.000000,0.0,0.982692,0.730846,0.000000,0,0,0,1,0
1484405,1484405,0.000000,3.000000,0.0,0.870000,0.000000,0.000000,0,0,0,1,0


In [13]:
y_tr.value_counts()

1    1341254
0    1341253
Name: went_on_backorder, dtype: int64

In [14]:
#since this is a classification project we will use a random forest and decision tree

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# **model-1**


In [16]:
model_1 = DecisionTreeClassifier()
model_1.fit(x_tr, y_tr)
y_pred_1 = model_1.predict(x_te)

# **model-2**

In [17]:
model_2 = RandomForestClassifier(n_estimators=40)
model_2.fit(x_tr, y_tr)
y_pred_2 = model_2.predict(x_te)

In [20]:
acc_score = accuracy_score(y_te, y_pred_1)
conf_matrix = confusion_matrix(y_te, y_pred_1)
print("Accuracy score for model 1",acc_score*100)
acc_score_2 = accuracy_score(y_te, y_pred_2)
conf_matrix = confusion_matrix(y_te, y_pred_2)
print("Accuracy Score for model 2",acc_score_2*100)

Accuracy score for model 1 99.1530314168681
Accuracy Score for model 2 99.40145565269518


In [24]:
import pickle
file_path = 'model1.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(model_1, file)

In [25]:
from google.colab import files
files.download('model1.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>