In [1]:
import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read dataset.csv (relative to the working directory) into a DataFrame
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.head(n=5)

Unnamed: 0,Replied Last Time,Time Since Last Message,Who Texted First,Response Length,Left on Read,Seen Story No Reply,Emoji Use,Only You Start Convos,Last Reaction,Mood,Decision
0,Yes,1,Them,25,No,No,A lot,No,LOL,Confident,Text them!
1,Yes,0,You,15,No,No,Sometimes,No,Haha,Meh,Text them!
2,Yes,5,Them,8,No,No,Sometimes,No,Dry response,Confident,Wait a bit.
3,Yes,3,You,12,No,Yes,A lot,Yes,Haha,Meh,Wait a bit.
4,Yes,2,You,3,No,Yes,Never,Yes,Dry response,Desperate,Try a meme first.


In [3]:
# Print the column dtypes and non-null counts to check the schema and look for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Replied Last Time        100 non-null    object
 1   Time Since Last Message  100 non-null    int64 
 2   Who Texted First         100 non-null    object
 3   Response Length          100 non-null    int64 
 4   Left on Read             100 non-null    object
 5   Seen Story No Reply      100 non-null    object
 6   Emoji Use                100 non-null    object
 7   Only You Start Convos    100 non-null    object
 8   Last Reaction            100 non-null    object
 9   Mood                     100 non-null    object
 10  Decision                 100 non-null    object
dtypes: int64(2), object(9)
memory usage: 8.7+ KB


In [4]:
# Show the distinct values of every column so unexpected categories or
# out-of-range numbers are easy to spot.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(f"{column_name}: {distinct_values}")

Replied Last Time: ['Yes' 'No']
Time Since Last Message: [ 1  0  5  3  2  4 10 14  7  6  8  9 12 11 13]
Who Texted First: ['Them' 'You']
Response Length: [25 15  8 12  3  0  1  5 30 20 10 40  2 18  7  4  6 22 14  9 11]
Left on Read: ['No' 'Yes']
Seen Story No Reply: ['No' 'Yes']
Emoji Use: ['A lot' 'Sometimes' 'Never']
Only You Start Convos: ['No' 'Yes']
Last Reaction: ['LOL' 'Haha' 'Dry response' 'Ignored']
Mood: ['Confident' 'Meh' 'Desperate']
Decision: ['Text them!' 'Wait a bit.' 'Try a meme first.' 'Abort mission.']


In [5]:
# Check class distribution: relative frequency of each target label.
class_shares = df["Decision"].value_counts(normalize=True)
print(class_shares)

Decision
Text them!           0.33
Wait a bit.          0.27
Try a meme first.    0.20
Abort mission.       0.20
Name: proportion, dtype: float64


In [6]:
# Pin the two numeric columns to an integer dtype, then re-print the schema
# to confirm the result.
integer_columns = {"Time Since Last Message": int, "Response Length": int}
df = df.astype(integer_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Replied Last Time        100 non-null    object
 1   Time Since Last Message  100 non-null    int64 
 2   Who Texted First         100 non-null    object
 3   Response Length          100 non-null    int64 
 4   Left on Read             100 non-null    object
 5   Seen Story No Reply      100 non-null    object
 6   Emoji Use                100 non-null    object
 7   Only You Start Convos    100 non-null    object
 8   Last Reaction            100 non-null    object
 9   Mood                     100 non-null    object
 10  Decision                 100 non-null    object
dtypes: int64(2), object(9)
memory usage: 8.7+ KB


In [7]:
# Separate the "Decision" target from the feature columns, then hold out
# 20% of the rows as a test set (fixed seed for a repeatable split).
X = df.drop(columns="Decision")
y = df["Decision"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [8]:
# One-hot encode every object-typed feature column; all other columns are
# passed through unchanged. Categories unseen during fit are encoded as
# all-zero rows instead of raising (handle_unknown="ignore").
catogorical_columns = list(X_train.select_dtypes(include=["object"]).columns)
preprocessor = ColumnTransformer(
    transformers=[
        (
            "one_hot",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            catogorical_columns,
        ),
    ],
    remainder="passthrough",
)

In [9]:
# Fit the preprocessor on the training features and wrap the encoded array
# in a DataFrame labelled with the generated feature names.
encoded_X = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
transformed_X = pd.DataFrame(encoded_X, columns=feature_names)
transformed_X.head()

Unnamed: 0,one_hot__Replied Last Time_No,one_hot__Replied Last Time_Yes,one_hot__Who Texted First_Them,one_hot__Who Texted First_You,one_hot__Left on Read_No,one_hot__Left on Read_Yes,one_hot__Seen Story No Reply_No,one_hot__Seen Story No Reply_Yes,one_hot__Emoji Use_A lot,one_hot__Emoji Use_Never,...,one_hot__Only You Start Convos_Yes,one_hot__Last Reaction_Dry response,one_hot__Last Reaction_Haha,one_hot__Last Reaction_Ignored,one_hot__Last Reaction_LOL,one_hot__Mood_Confident,one_hot__Mood_Desperate,one_hot__Mood_Meh,remainder__Time Since Last Message,remainder__Response Length
0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0,3.0
1,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,12.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,10.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,5.0,0.0
4,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0,15.0


In [10]:
# One-hot encode the training target. The encoder expects a 2-D input, so the
# label vector is reshaped into a single column first. The resulting indicator
# matrix (one column per class) is what the later cells fit the models on.
encoder = OneHotEncoder(sparse_output=False)
y_train_column = y_train.to_numpy().reshape(-1, 1)
encoded_y = encoder.fit_transform(y_train_column)

In [11]:
# Exhaustive 5-fold cross-validated search over decision-tree hyper-parameters,
# scored by accuracy on the encoded training data.
param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=range(5, 15),
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    ccp_alpha=[0.0, 0.001, 0.01, 0.1],
)
grid_search = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(transformed_X, encoded_y)

In [12]:
# Report the hyper-parameter combination with the best cross-validated score.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [13]:
# Build the final decision tree from the hyper-parameters the grid search
# selected, rather than from values copied by hand out of its printed output.
# A hand-copied set silently goes stale whenever the data, the split or the
# search grid changes; reading `best_params_` keeps the model in sync.
model = tree.DecisionTreeClassifier(random_state=42, **grid_search.best_params_)
model.fit(transformed_X, encoded_y)

In [14]:
# Preview the raw (not yet encoded) held-out feature rows.
X_test.head()

Unnamed: 0,Replied Last Time,Time Since Last Message,Who Texted First,Response Length,Left on Read,Seen Story No Reply,Emoji Use,Only You Start Convos,Last Reaction,Mood
83,No,7,You,0,Yes,No,Never,Yes,Ignored,Meh
53,Yes,14,You,1,No,Yes,Never,Yes,Dry response,Desperate
70,Yes,0,You,10,No,No,Sometimes,No,Haha,Meh
45,Yes,4,You,3,No,Yes,Never,Yes,Dry response,Meh
44,No,11,You,0,Yes,Yes,Never,Yes,Ignored,Desperate


In [15]:
# Apply the already-fitted encoders to the held-out split (transform only,
# no refit) and label the encoded features with the training feature names.
encoded_X_test = preprocessor.transform(X_test)
y_test_column = y_test.to_numpy().reshape(-1, 1)
encoded_y_test = encoder.transform(y_test_column)
transformed_X_test = pd.DataFrame(encoded_X_test, columns=feature_names)

In [16]:
# Display the output column names produced by the fitted preprocessor.
feature_names

array(['one_hot__Replied Last Time_No', 'one_hot__Replied Last Time_Yes',
       'one_hot__Who Texted First_Them', 'one_hot__Who Texted First_You',
       'one_hot__Left on Read_No', 'one_hot__Left on Read_Yes',
       'one_hot__Seen Story No Reply_No',
       'one_hot__Seen Story No Reply_Yes', 'one_hot__Emoji Use_A lot',
       'one_hot__Emoji Use_Never', 'one_hot__Emoji Use_Sometimes',
       'one_hot__Only You Start Convos_No',
       'one_hot__Only You Start Convos_Yes',
       'one_hot__Last Reaction_Dry response',
       'one_hot__Last Reaction_Haha', 'one_hot__Last Reaction_Ignored',
       'one_hot__Last Reaction_LOL', 'one_hot__Mood_Confident',
       'one_hot__Mood_Desperate', 'one_hot__Mood_Meh',
       'remainder__Time Since Last Message', 'remainder__Response Length'],
      dtype=object)

In [17]:
# Predict the held-out rows with the decision tree. The output is a 2-D
# indicator array because the tree was fitted on the one-hot encoded target.
predict = model.predict(X=transformed_X_test)
print(predict)

[[1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]]


In [18]:
# Score the decision tree's test predictions against the encoded test labels.
acc = accuracy_score(y_true=encoded_y_test, y_pred=predict)
print(acc)

0.85


In [19]:
# Training-set accuracy of the decision tree, for comparison with the test
# score (a large gap indicates overfitting).
# Dedicated names are used so the test-set `predict` / `acc` values are not
# overwritten: re-running the test-accuracy cell afterwards would otherwise
# score training predictions against test labels.
train_predict = model.predict(transformed_X)
train_acc = accuracy_score(encoded_y, train_predict)
print(train_acc)

1.0


In [20]:
# Train a 100-tree random forest with balanced class weights on the same
# encoded training data used for the decision tree.
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)
rf.fit(transformed_X, encoded_y)

In [21]:
# Evaluate the random forest on the held-out split.
predict = rf.predict(transformed_X_test)

# The forest was fitted on a one-hot encoded target, so each class column is
# predicted independently and a row is not guaranteed to contain exactly one 1.
# Rows with zero or several 1s cannot be mapped back to a single label reliably
# (inverse_transform either rejects them or picks one column), so report them
# explicitly instead of letting them pass unnoticed.
ones_per_row = predict.sum(axis=1)
ambiguous_rows = np.flatnonzero(ones_per_row != 1)
if ambiguous_rows.size:
    print(
        f"Warning: {ambiguous_rows.size} prediction row(s) are not one-hot "
        f"(row positions {ambiguous_rows.tolist()}); "
        "their decoded labels are unreliable."
    )

# Map the indicator rows back to the original label strings.
decoded_prediction = encoder.inverse_transform(predict)
# With a 2-D indicator target this is subset accuracy: a row counts as correct
# only when every class column matches.
acc = accuracy_score(encoded_y_test, predict)
print(acc)
print(decoded_prediction)

0.95
[['Abort mission.']
 ['Try a meme first.']
 ['Text them!']
 ['Try a meme first.']
 ['Abort mission.']
 ['Abort mission.']
 ['Wait a bit.']
 ['Abort mission.']
 ['Text them!']
 ['Text them!']
 ['Try a meme first.']
 ['Text them!']
 ['Text them!']
 ['Text them!']
 ['Wait a bit.']
 ['Try a meme first.']
 ['Text them!']
 ['Abort mission.']
 ['Try a meme first.']
 ['Abort mission.']]


In [22]:
# Persist the fitted artifacts to the working directory so they can be
# reloaded for inference. `joblib` is imported in the notebook's top import
# cell rather than here, so all dependencies are visible in one place.

# Save preprocessors
joblib.dump(preprocessor, "preprocessorX.pkl")
# Note: despite the file name, `encoder` is the OneHotEncoder fitted on the
# target column, not a LabelEncoder.
joblib.dump(encoder, "label_encoder_y.pkl")

# Save trained model
joblib.dump(rf, "RandomForestClassifier_model.pkl")

['RandomForestClassifier_model.pkl']