In [243]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [244]:
df = pd.read_csv('data/Train.csv')
df.head()

Unnamed: 0,UniqueID,type_school,school_accreditation,gender,interest,residence,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college
0,30,Vocational,B,Female,Uncertain,Rural,50.0,6450000.0,78.4,85.86,True,False
1,536,Vocational,A,Male,Very Interested,Urban,54.0,5450000.0,70.3,,False,True
2,696,Vocational,A,Female,Very Interested,Urban,52.0,5520000.0,82.5,84.59,False,True
3,558,Academic,A,Male,Not Interested,Rural,41.0,3020000.0,50.6,77.18,True,False
4,837,Vocational,B,Male,Uncertain,Rural,52.0,6120000.0,65.8,84.55,True,False


In [245]:
categorical_cols = ["type_school", "school_accreditation", "gender", "interest", "residence", "parent_was_in_college"]
continuous_cols = [col for col in df.columns if col not in categorical_cols and col != 'UniqueID']

for col in categorical_cols:
    most_frequent = df[col].mode()[0]
    df[col].fillna(most_frequent, inplace=True)

for col in continuous_cols:
    mean_value = df[col].mean()
    df[col].fillna(mean_value, inplace=True)

df.isnull().sum()

UniqueID                 0
type_school              0
school_accreditation     0
gender                   0
interest                 0
residence                0
parent_age               0
parent_salary            0
house_area               0
average_grades           0
parent_was_in_college    0
will_go_to_college       0
dtype: int64

In [246]:
df.describe()

Unnamed: 0,UniqueID,parent_age,parent_salary,house_area,average_grades
count,800.0,800.0,800.0,800.0,800.0
mean,498.41875,52.061728,5425740.0,74.433108,86.131499
std,289.511545,3.25938,1355573.0,14.627427,3.184816
min,1.0,40.0,1660000.0,20.4,75.62
25%,242.75,50.0,4547500.0,65.675,84.04
50%,494.5,52.0,5425740.0,74.55,86.015
75%,751.25,54.0,6345000.0,83.8,87.6625
max,1000.0,65.0,10000000.0,120.0,97.0


In [247]:
df.info()
X_unique_ids = df["UniqueID"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   UniqueID               800 non-null    int64  
 1   type_school            800 non-null    object 
 2   school_accreditation   800 non-null    object 
 3   gender                 800 non-null    object 
 4   interest               800 non-null    object 
 5   residence              800 non-null    object 
 6   parent_age             800 non-null    float64
 7   parent_salary          800 non-null    float64
 8   house_area             800 non-null    float64
 9   average_grades         800 non-null    float64
 10  parent_was_in_college  800 non-null    bool   
 11  will_go_to_college     800 non-null    bool   
dtypes: bool(2), float64(4), int64(1), object(5)
memory usage: 64.2+ KB


In [248]:
X = df.drop(columns=["UniqueID", "will_go_to_college"])
y = df["will_go_to_college"]

In [249]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
columns_to_encode = ["type_school", "school_accreditation", "gender", "interest", "residence", "parent_was_in_college"]
for column in columns_to_encode:
    X[column] = le.fit_transform(X[column])

X.columns

Index(['type_school', 'school_accreditation', 'gender', 'interest',
       'residence', 'parent_age', 'parent_salary', 'house_area',
       'average_grades', 'parent_was_in_college'],
      dtype='object')

In [250]:
X.drop(columns=["parent_age"], inplace=True)

In [251]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Support Vector Classifier
svc_classifier = SVC()
svc_classifier.fit(X_train, y_train)
svc_predictions = svc_classifier.predict(X_test)
svc_accuracy = accuracy_score(y_test, svc_predictions)
svc_precision = precision_score(y_test, svc_predictions)
svc_recall = recall_score(y_test, svc_predictions)
svc_f1 = f1_score(y_test, svc_predictions)

# Random Forest Classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X, y)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_precision = precision_score(y_test, dt_predictions)
dt_recall = recall_score(y_test, dt_predictions)
dt_f1 = f1_score(y_test, dt_predictions)

# Logistic Regression Classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train, y_train)
lr_predictions = lr_classifier.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_precision = precision_score(y_test, lr_predictions)
lr_recall = recall_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)

# Print test metrics for each classifier
print("Support Vector Classifier:")
print("Accuracy:", svc_accuracy)
print("Precision:", svc_precision)
print("Recall:", svc_recall)
print("F1 Score:", svc_f1)
print("\nRandom Forest Classifier:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1)
print("\nDecision Tree Classifier:")
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1)
print("\nLogistic Regression Classifier:")
print("Accuracy:", lr_accuracy)
print("Precision:", lr_precision)
print("Recall:", lr_recall)
print("F1 Score:", lr_f1)

Support Vector Classifier:
Accuracy: 0.7
Precision: 0.6931818181818182
Recall: 0.7439024390243902
F1 Score: 0.7176470588235295

Random Forest Classifier:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Decision Tree Classifier:
Accuracy: 0.8125
Precision: 0.8333333333333334
Recall: 0.7926829268292683
F1 Score: 0.8125000000000001

Logistic Regression Classifier:
Accuracy: 0.5125
Precision: 0.5125
Recall: 1.0
F1 Score: 0.6776859504132231


In [252]:
import joblib

model_filename = 'rf_model_latest.pkl'
joblib.dump(rf_classifier, model_filename)

['rf_model_latest.pkl']