# Models:

## Import Required Libraries

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data Preprocessing,
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

### ML Models,
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

### Model Tuning,
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

## Logistic Regression

In [25]:
df_train=pd.read_pickle('Cleaned_train.pickle')
X = df_train.drop('Y', axis=1)
y = df_train['Y']

In [26]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
if 'Program Skill Level' in categorical_cols:
    categorical_cols.remove('Program Skill Level')
skill_level_categories = [['متقدم', 'متوسط', 'مبتدئ', 'مفقود']]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('ordinal', OrdinalEncoder(categories=skill_level_categories), ['Program Skill Level'])
    ],
    remainder='passthrough'
)
X_processed = preprocessor.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=34)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8676923076923077
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1089
           1       0.64      0.42      0.51       211

    accuracy                           0.87      1300
   macro avg       0.77      0.69      0.71      1300
weighted avg       0.85      0.87      0.86      1300



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1089
           1       0.72      0.37      0.49       211

    accuracy                           0.87      1300
   macro avg       0.80      0.67      0.71      1300
weighted avg       0.86      0.87      0.86      1300

