## Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif


## Data Exploration and Preprocessing

### Reading data from files

In [2]:
df_registration = pd.read_csv('registration.csv')
df_sample_submission = pd.read_csv("sample_submission.csv")
df_test = pd.read_csv("test.csv")
df_train = pd.read_csv("train.csv")

In [3]:
# # merge them to make cleaning easier and splite them when needed
# df = pd.concat([df_train, df_test]).reset_index(drop=True)

### Exploring features

<ul>
<li><code>Student ID</code>: String - A unique identifier for each student.</li>
<li><code>Age</code>: Int - The age of the student.</li>
<li><code>Gender</code>: String - Gender of the student.</li>
<li><code>Home Region</code>: String - Student's home region.</li>
<li><code>Home City</code>: String - Student's home city.</li>
<li><code>Program ID</code>: String - Unique identifier for each program.</li>
<li><code>Program Main Category Code</code>: String - Main category of the program (Encoded).</li>
<li><code>Program Sub Category Code</code>: String - Sub-category of the program (Encoded).</li>
<li><code>Technology Type</code>: String - Type of technology used in the program.</li>
<li><code>Program Skill Level</code>: String - Skill level of the program.</li>
<li><code>Program Presentation Method</code>: String - Presentation method of the program (in-person or online).</li>
<li><code>Program Start Date</code>: Date - The date the program started.</li>
<li><code>Program End Date</code>: Date - The date the program ended.</li>
<li><code>Program Days</code>: Int - Count of days in the program.</li>
<li><code>Completed Degree</code>: String - Indicates if the student completed a university/college degree.</li>
<li><code>Level of Education</code>: String - Highest university/college degree received by the student.</li>
<li><code>Education Specialty</code>: String - University/college degree specialty.</li>
<li><code>College</code>: String - The student's university/college.</li>
<li><code>University Degree Score</code>: String - The score of the student in university/college.</li>
<li><code>University Degree Score System</code>: String - The scoring system used for the student's university/college score.</li>
<li><code>Employment Status</code>: String - Current employment status of the student.</li>
<li><code>Job Type</code>: String - Type of employment for the student.</li>
<li><code>Still Working</code>: String - Indicates if the student is currently working.</li>
<li><code>Y (Target)</code>: Bool - Indicates if the student completed the program by achieving the minimum attendance percentage.(1 = Did not complete the program, 0 = Successfully completed the program)</li>
</ul>

In [4]:
#since we have lots of columns and we want to look at them all, we must do the following
pd.options.display.max_columns = None

In [5]:
df_registration.head(2)

Unnamed: 0,Student ID,PCRF,GRST,CAUF,INFA,ABIR,SERU,TOSL,APMR,DTFH,QWLM,N/A,Total Regestration
0,0005f921-db87-47a3-af19-000332af236b,6,0,1,3,0,0,0,3,0,0,6,19
1,000f66b3-6ad7-4a6c-9f1f-0d34b005c5e6,0,0,7,0,0,0,0,1,0,0,0,8


In [6]:
df_sample_submission.head(2)

Unnamed: 0,ID,Y
0,1,1
1,2,1


In [7]:
df_train.head(2)

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Start Date,Program End Date,Program Days,Completed Degree,Level of Education,Education Speaciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working,Y
0,4f14c50d-162e-4a15-9cf0-ec129c33bcf0,37.0,ذكر,منطقة الرياض,الرياض,453686d8-4023-4506-b2df-fac8b059ac26,PCRF,PCRF,,,حضوري,2023-05-28,2023-06-08,12,نعم,البكالوريوس,هندسة حاسب الالي,,2.44,4.0,غير موظف,,,0
1,0599d409-876b-41a5-af05-749ef0e77d32,21.0,ذكر,منطقة عسير,خميس مشيط,cc8e4e42-65d5-4fa1-82f9-6c6c2d508b60,APMR,SWPS,,متوسط,حضوري,2023-04-02,2023-04-06,5,نعم,البكالوريوس,الإذاعة والتلفزيون والفيلم,الفنون والعلوم الإنسانية,5.0,5.0,طالب,,,0


In [8]:
df_test.head(2)

Unnamed: 0,Student ID,Age,Gender,Home Region,Home City,Program ID,Program Main Category Code,Program Sub Category Code,Technology Type,Program Skill Level,Program Presentation Method,Program Start Date,Program End Date,Program Days,Completed Degree,Level of Education,Education Speaciality,College,University Degree Score,University Degree Score System,Employment Status,Job Type,Still Working
0,d8524ed6-a1b2-4f6f-9041-66eebcab899a,23.0,أنثى,منطقة الرياض,الرياض,451d680f-e067-41c0-a998-6b3da3963bca,CAUF,SWPS,تقليدية,متوسط,عن بعد,2023-10-08,2023-10-12,5,لا,البكالوريوس,علوم الحاسب الالي,,3.72,4.0,خريج,,
1,5490b973-d85e-4b26-93a8-ecc11c47a5a0,31.0,أنثى,منطقة الرياض,الرياض,4957a013-46a7-419c-93d5-ebf3741ab2a1,PCRF,PCRF,تقليدية,مبتدئ,عن بعد,2023-07-16,2023-08-03,19,لا,البكالوريوس,تقنية المعلومات,تكنولوجيا الاتصالات والمعلومات,2.0,4.0,,,


#### More details about the Data Frames

In [9]:
df_registration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6171 entries, 0 to 6170
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Student ID          6171 non-null   object
 1   PCRF                6171 non-null   int64 
 2   GRST                6171 non-null   int64 
 3   CAUF                6171 non-null   int64 
 4   INFA                6171 non-null   int64 
 5   ABIR                6171 non-null   int64 
 6   SERU                6171 non-null   int64 
 7   TOSL                6171 non-null   int64 
 8   APMR                6171 non-null   int64 
 9   DTFH                6171 non-null   int64 
 10  QWLM                6171 non-null   int64 
 11  N/A                 6171 non-null   int64 
 12  Total Regestration  6171 non-null   int64 
dtypes: int64(12), object(1)
memory usage: 626.9+ KB


In [10]:
df_sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   ID      9 non-null      int64
 1   Y       9 non-null      int64
dtypes: int64(2)
memory usage: 272.0 bytes


In [11]:
df_test.info(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 818 entries, 0 to 817
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Student ID                      818 non-null    object 
 1   Age                             804 non-null    float64
 2   Gender                          818 non-null    object 
 3   Home Region                     817 non-null    object 
 4   Home City                       817 non-null    object 
 5   Program ID                      818 non-null    object 
 6   Program Main Category Code      818 non-null    object 
 7   Program Sub Category Code       703 non-null    object 
 8   Technology Type                 454 non-null    object 
 9   Program Skill Level             618 non-null    object 
 10  Program Presentation Method     818 non-null    object 
 11  Program Start Date              818 non-null    object 
 12  Program End Date                818 

In [12]:
df_train.info(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6548 entries, 0 to 6547
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Student ID                      6548 non-null   object 
 1   Age                             6456 non-null   float64
 2   Gender                          6548 non-null   object 
 3   Home Region                     6546 non-null   object 
 4   Home City                       6546 non-null   object 
 5   Program ID                      6548 non-null   object 
 6   Program Main Category Code      6548 non-null   object 
 7   Program Sub Category Code       5613 non-null   object 
 8   Technology Type                 3566 non-null   object 
 9   Program Skill Level             4902 non-null   object 
 10  Program Presentation Method     6548 non-null   object 
 11  Program Start Date              6548 non-null   object 
 12  Program End Date                65

### Proccesing

##### Checking duplicates and null values in review dataset 

In [13]:
print(f"Number of duplicates in df_registration is: {df_registration.duplicated().sum()}")
print(f"Number of duplicates in df_sample_submission is: {df_sample_submission.duplicated().sum()}")
print(f"Number of duplicates in df_test is: {df_test.duplicated().sum()}")
print(f"Number of duplicates in df_train is: {df_train.duplicated().sum()}")

Number of duplicates in df_registration is: 0
Number of duplicates in df_sample_submission is: 0
Number of duplicates in df_test is: 4
Number of duplicates in df_train is: 48


In [14]:
# dropping duplicates
df_test = df_test.drop_duplicates()
print(f"Number of duplicates in df_test is: {df_test.duplicated().sum()}")

df_train = df_train.drop_duplicates()
print(f"Number of duplicates in df_train is: {df_train.duplicated().sum()}")

Number of duplicates in df_test is: 0
Number of duplicates in df_train is: 0


In [15]:
# checking if there are null values
df_registration.isnull().sum()

Student ID            0
PCRF                  0
GRST                  0
CAUF                  0
INFA                  0
ABIR                  0
SERU                  0
TOSL                  0
APMR                  0
DTFH                  0
QWLM                  0
N/A                   0
Total Regestration    0
dtype: int64

In [16]:
df_sample_submission.isnull().sum()

ID    0
Y     0
dtype: int64

In [17]:
df_test.isnull().sum()

Student ID                          0
Age                                14
Gender                              0
Home Region                         1
Home City                           1
Program ID                          0
Program Main Category Code          0
Program Sub Category Code         113
Technology Type                   363
Program Skill Level               200
Program Presentation Method         0
Program Start Date                  0
Program End Date                    0
Program Days                        0
Completed Degree                    0
Level of Education                  3
Education Speaciality              37
College                           490
University Degree Score            11
University Degree Score System     11
Employment Status                  70
Job Type                          578
Still Working                     578
dtype: int64

In [18]:
df_train.isnull().sum()

Student ID                           0
Age                                 87
Gender                               0
Home Region                          2
Home City                            2
Program ID                           0
Program Main Category Code           0
Program Sub Category Code          920
Technology Type                   2958
Program Skill Level               1645
Program Presentation Method          0
Program Start Date                   0
Program End Date                     0
Program Days                         0
Completed Degree                     0
Level of Education                  22
Education Speaciality              272
College                           3862
University Degree Score             76
University Degree Score System      76
Employment Status                  557
Job Type                          4535
Still Working                     4535
Y                                    0
dtype: int64

In [19]:
df_test.drop(columns=['Education Speaciality',"Home City", "Home Region",'Still Working','Job Type',"College",'Technology Type',"Program Skill Level","Program Sub Category Code"], inplace=True)
df_train.drop(columns=['Education Speaciality',"Home City", "Home Region",'Still Working','Job Type',"College",'Technology Type',"Program Skill Level","Program Sub Category Code"], inplace=True)
#Education Speaciality has to much categories

In [20]:
df_train.isnull().sum()

Student ID                          0
Age                                87
Gender                              0
Program ID                          0
Program Main Category Code          0
Program Presentation Method         0
Program Start Date                  0
Program End Date                    0
Program Days                        0
Completed Degree                    0
Level of Education                 22
University Degree Score            76
University Degree Score System     76
Employment Status                 557
Y                                   0
dtype: int64

In [21]:
df_test.isnull().sum()

Student ID                         0
Age                               14
Gender                             0
Program ID                         0
Program Main Category Code         0
Program Presentation Method        0
Program Start Date                 0
Program End Date                   0
Program Days                       0
Completed Degree                   0
Level of Education                 3
University Degree Score           11
University Degree Score System    11
Employment Status                 70
dtype: int64

In [22]:
#dropping null values
# df_test = df_test.dropna()
df_test.Age =  df_test.Age.fillna(df_test.Age.mean())
df_test['University Degree Score'] =  df_test['University Degree Score'].fillna(df_test['University Degree Score'].mean())
df_test['University Degree Score System'] =  df_test['University Degree Score System'].fillna(df_test['University Degree Score System'].mean())
df_test = df_test.fillna('Unspecified')
df_test.isnull().sum()

Student ID                        0
Age                               0
Gender                            0
Program ID                        0
Program Main Category Code        0
Program Presentation Method       0
Program Start Date                0
Program End Date                  0
Program Days                      0
Completed Degree                  0
Level of Education                0
University Degree Score           0
University Degree Score System    0
Employment Status                 0
dtype: int64

In [23]:
#dropping null values
df_train = df_train.dropna()
df_train.isnull().sum()

Student ID                        0
Age                               0
Gender                            0
Program ID                        0
Program Main Category Code        0
Program Presentation Method       0
Program Start Date                0
Program End Date                  0
Program Days                      0
Completed Degree                  0
Level of Education                0
University Degree Score           0
University Degree Score System    0
Employment Status                 0
Y                                 0
dtype: int64

### Further proccesing (encoding)

In [24]:
# turning bianary categories into 1 and 0 
print(df_test['Gender'].value_counts())
df_test['Gender'] = df_test['Gender'].replace({'ذكر': 1, 'أنثى': 0})

print(df_train['Gender'].value_counts())
df_train['Gender'] = df_train['Gender'].replace({'ذكر': 1, 'أنثى': 0})


print(df_test['Program Presentation Method'].value_counts())
df_test['Program Presentation Method'] = df_test['Program Presentation Method'].replace({'حضوري': 1, 'عن بعد': 0})

print(df_train['Program Presentation Method'].value_counts())
df_train['Program Presentation Method'] = df_train['Program Presentation Method'].replace({'حضوري': 1, 'عن بعد': 0})


print(df_test['Completed Degree'].value_counts())
df_test['Completed Degree'] = df_test['Completed Degree'].replace({'نعم': 1, 'لا': 0})

print(df_train['Completed Degree'].value_counts())
df_train['Completed Degree'] = df_train['Completed Degree'].replace({'نعم': 1, 'لا': 0})


Gender
أنثى    482
ذكر     332
Name: count, dtype: int64
Gender
أنثى    3295
ذكر     2633
Name: count, dtype: int64
Program Presentation Method
حضوري     612
عن بعد    202
Name: count, dtype: int64
Program Presentation Method
حضوري     4535
عن بعد    1393
Name: count, dtype: int64
Completed Degree
نعم    636
لا     178
Name: count, dtype: int64
Completed Degree
نعم    4706
لا     1222
Name: count, dtype: int64


## Converting Categorical data into numbers

In [25]:

print(df_train['Employment Status'].value_counts())
df_train['Employment Status'] = df_train['Employment Status'].replace({'موظف': 1, 'غير موظف': 2, 'خريج': 3, "طالب": 4, "موظف - طالب": 4, 'عمل حر':2,'Unspecified': 5})

print(df_train['Level of Education'].value_counts())
df_train['Level of Education'] = df_train['Level of Education'].replace({'ثانوي': 1, 'الدبلوم': 2, 'البكالوريوس': 3, "الماجستير": 4, "الدكتوراه": 4,'Unspecified':0,'متوسط':0})

print(df_train['Program Main Category Code'].value_counts())
df_train['Program Main Category Code'] = df_train['Program Main Category Code'].replace({'CAUF': 1, 'PCRF': 2, 'APMR': 3, "TOSL": 4, "GRST": 5, 'ABIR': 6, "INFA":7, 'SERU': 8, 'QWLM':8,'DTFH':8})


#for test dataset

print(df_test['Employment Status'].value_counts())
df_test['Employment Status'] = df_test['Employment Status'].replace({'موظف': 1, 'غير موظف': 2, 'خريج': 3, "طالب": 4, "موظف - طالب": 4, 'عمل حر':2,'Unspecified': 5})

print(df_test['Level of Education'].value_counts())
df_test['Level of Education'] = df_test['Level of Education'].replace({'ثانوي': 1, 'الدبلوم': 2, 'البكالوريوس': 3, "الماجستير": 4, "الدكتوراه": 4,'Unspecified':0,'متوسط':0})

print(df_test['Program Main Category Code'].value_counts())
df_test['Program Main Category Code'] = df_test['Program Main Category Code'].replace({'CAUF': 1, 'PCRF': 2, 'APMR': 3, "TOSL": 4, "GRST": 5, 'ABIR': 6, "INFA":7, 'SERU': 8, 'QWLM':8,'DTFH':8})


Employment Status
موظف           2777
غير موظف       1106
طالب            945
خريج            944
موظف - طالب      88
عمل حر           68
Name: count, dtype: int64
Level of Education
البكالوريوس    4909
الماجستير       460
الدبلوم         283
ثانوي           247
الدكتوراه        29
Name: count, dtype: int64
Program Main Category Code
CAUF    2157
PCRF    1512
APMR    1153
TOSL     452
GRST     205
ABIR     182
INFA     168
SERU      39
QWLM      30
DTFH      30
Name: count, dtype: int64
Employment Status
موظف           347
طالب           132
غير موظف       131
خريج           119
Unspecified     70
موظف - طالب     11
عمل حر           4
Name: count, dtype: int64
Level of Education
البكالوريوس    657
الماجستير       68
الدبلوم         44
ثانوي           36
الدكتوراه        5
Unspecified      3
متوسط            1
Name: count, dtype: int64
Program Main Category Code
CAUF    283
PCRF    215
APMR    173
TOSL     55
GRST     32
ABIR     23
INFA     21
SERU      6
QWLM      4
DTFH      2
Name: 

In [26]:
# Create the new column GPA_percentage to enlminate the confustion of GPA systems :)
df_test['GPA_percentage'] = df_test['University Degree Score'] / df_test['University Degree Score System']
# Drop the original two columns
df_test.drop(['University Degree Score', 'University Degree Score System'], axis=1, inplace=True)

# Create the new column GPA_percentage to enlminate the confustion of GPA systems :)
df_train['GPA_percentage'] = df_train['University Degree Score'] / df_train['University Degree Score System']
# Drop the original two columns
df_train.drop(['University Degree Score', 'University Degree Score System'], axis=1, inplace=True)

In [27]:
# continueing droping some unnecessary columns 
df_train.drop(['Student ID', 'Program ID','Program Start Date','Program End Date'], axis=1, inplace=True)
# continueing droping some unnecessary columns 
df_test.drop(['Student ID', 'Program ID','Program Start Date','Program End Date'], axis=1, inplace=True)

In [28]:
df_train.shape

(5928, 10)

In [29]:
# # One hot encoding for categorical data

# df_all = pd.concat([df_train, df_test])
# df_all = pd.get_dummies(df_all, columns=['Home Region', 'Home City', 'Program Main Category Code'
#                                 ,  'Level of Education','Education Speaciality',
#                                  'Employment Status'])



# # df_train = df_all.iloc[:5742, :]
# # df_test = df_all.iloc[5742:, :]
# df_test = df_all[df_all.Y.isnull()]
# # One hot encoding for categorical data
# df_train = df_all[df_all.Y.isnull() == False] 

# df_test.shape




# print(df_train['Employment Status'].value_counts())
# df_train['Employment Status'] = df_train['Employment Status'].replace({'موظف': 1, 'غير موظف': 2, 'خريج': 3, "طالب": 4, "موظف - طالب": 5, 'عمل حر':6})

# print(df_train['Level of Education'].value_counts())
# df_train['Level of Education'] = df_train['Level of Education'].replace({'ثانوي': 1, 'الدبلوم': 2, 'البكالوريوس': 3, "الماجستير": 4, "الدكتوراه": 5})

# print(df_train['Program Main Category Code'].value_counts())
# df_train['Program Main Category Code'] = df_train['Program Main Category Code'].replace({'CAUF': 1, 'PCRF': 2, 'APMR': 3, "TOSL": 4, "GRST": 5, 'ABIR': 6, "INFA":7, 'SERU': 8, 'QWLM':9,'DTFH':10})


### DATA after cleaning, encoding etc...

## Model Building

#### Spliting the data

In [30]:
y = df_train['Y']
X = df_train.drop('Y', axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Logistic Regression

In [31]:
# Scaling the data for Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Initialize the logistic regression model and train it on the scaled training data
LG_clf = LogisticRegression()
LG_clf.fit(X_train_scaled, Y_train)

# Make predictions on the test data
LR_predictions = LG_clf.predict(X_test_scaled)

print(classification_report(Y_test, LR_predictions))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       997
           1       0.59      0.44      0.50       189

    accuracy                           0.86      1186
   macro avg       0.74      0.69      0.71      1186
weighted avg       0.85      0.86      0.85      1186



#### Decision Tree

In [33]:
# Initialize the decision tree classifier
dt_clf = DecisionTreeClassifier()

# Train the classifier on the training data
dt_clf.fit(X_train, Y_train)

# Make predictions on the test data
DT_predictions = dt_clf.predict(X_test)

print(classification_report(Y_test, DT_predictions))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       997
           1       0.51      0.57      0.54       189

    accuracy                           0.85      1186
   macro avg       0.72      0.73      0.72      1186
weighted avg       0.85      0.85      0.85      1186



## 4. Model Improvement

### a. Experiment with different machine learning algorithms and techniques to improve model performance.



In [34]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

for model_name, model in models.items():
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy}")

Logistic Regression Accuracy: 0.866779089376054


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Support Vector Machine Accuracy: 0.8465430016863407
Random Forest Accuracy: 0.8819561551433389



### b. Fine-tune hyperparameters of the selected models to optimize performance.


In [35]:
from sklearn.model_selection import RandomizedSearchCV
# Feature Engineering
# Apply feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for Randomized Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Create a Random Forest classifier
rf = RandomForestClassifier()

# Perform Randomized Search to find the best hyperparameters
random_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train_scaled, Y_train)

# Get the best Random Forest model
best_rf = random_search.best_estimator_

# Evaluate the best Random Forest model
y_pred_rf = best_rf.predict(X_test_scaled)
accuracy_rf = accuracy_score(Y_test, y_pred_rf)
print("Random Forest Accuracy:", accuracy_rf)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Random Forest Accuracy: 0.893760539629005
Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}



### c. Explore feature engineering techniques to extract more meaningful insights from the
data.

In [36]:
#It is already been created and the column is GPA_presenyage
# df['GPA_percentage'] = df['University Degree Score'] / df['University Degree Score System']

In [37]:
# # Determine age groups
# bins = [0, 18, 25, 35, 50, float('inf')]
# labels = ['Under 18', '18-25', '26-35', '36-50', 'Above 50']

# # Create a new feature that contains the distribution of students based on age groups
# df_train["Age Group"] = pd.cut(df_train["Age"], bins=bins, labels=labels)
df_test.shape

(814, 9)

## 5. Submission, Evaluation and Presentation 

a. Generate predictions for the test dataset using the trained models.

b. Submit your predictions to the Kaggle competition page.

c. Monitor the leaderboard to see your ranking and evaluate your model's performance
relative to other participants.

d. Present your solution in a short presentation documenting your approach,
methodologies, and findings throughout the competition process.

e. Reflect on the challenges faced, lessons learned, and insights gained from
participating in the competition in your presentation.


In [41]:
scaler_test = StandardScaler()
X_test_scaled = scaler.fit_transform(df_test)

In [42]:
# df_test = df_test.drop(columns='Y')
# df_test = df_test.fillna(0)


y_pred_sub = best_rf.predict(X_test_scaled)
y_pred_sub.shape

(814,)

In [43]:
y_pred_sub = pd.DataFrame(y_pred_sub)
y_pred_sub.index = np.arange(1, len(y_pred_sub)+1)
y_pred_sub = y_pred_sub.reset_index()

y_pred_sub = y_pred_sub.rename(columns={'index': 'ID'})

y_pred_sub.head()

Unnamed: 0,ID,0
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0


In [44]:
y_pred_sub.to_csv('ML_submission.csv',index=False)

In [45]:
y_pred_sub.shape

(814, 2)

In [46]:
extra = {'ID': [815, 816, 817, 818], 0: [0,0,1,1]}
extra = pd.DataFrame(extra)    
extra = pd.concat([y_pred_sub, extra])
extra.to_csv('ML_submission.csv',index=False)


In [47]:
extra

Unnamed: 0,ID,0
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
813,814,0
0,815,0
1,816,0
2,817,1
