In [348]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pypyodbc as odbc
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


In [349]:
# Connection settings for the source database.
DRIVER_NAME = 'SQL Server'
SERVER_NAME = 'Haitham'
DATABASE_NAME = 'ITIExaminationSystem'

# ODBC connection string; the triple braces render the driver name wrapped in
# literal {...}. Trusted_Connection=yes means no user name/password is embedded here.
CONNECTION_STRING = f'DRIVER={{{DRIVER_NAME}}};SERVER={SERVER_NAME};DATABASE={DATABASE_NAME};Trusted_Connection=yes;'

In [350]:
# Open the database connection described by CONNECTION_STRING and print it
# as a quick check that the connect call succeeded.
# NOTE(review): no conn.close() appears in the visible cells — confirm the
# connection is released once the data has been loaded.
conn = odbc.connect(CONNECTION_STRING) 
print(conn)

<pypyodbc.Connection object at 0x0000023FABACE5C0>


In [351]:
# One row per (student, company) pair:
#   * student attributes come from Student,
#   * [Total grade] is the sum of student_grade over the student's answers to
#     exams whose Exam_Type is 'Normal',
#   * Company is LEFT JOINed, so students without a company row get a null
#     Company_Name.
# The adjacent string literals are concatenated by Python into a single
# statement; each fragment carries its own trailing space where one is needed.
query = (
    "select s.Student_ID, s.Student_Faculty, s.Student_Faculty_Grade,"
    "s.Student_Gender,s.Student_ITI_Status, s.Student_Marital_Status, "
    "sum(sea.student_grade) as [Total grade] , c.Company_Name "
    "from Student as s "
    "left join Student_Company as sc on s.Student_ID = sc.Student_ID "
    "left join Company as c on sc.Company_ID = c.Company_ID "
    "inner join Student_Exam_Answer as sea on sea.Student_ID = s.Student_ID "
    "where sea.Exam_ID in (select Exam_ID from exam where Exam_Type = 'Normal') "
    "group by s.Student_ID, s.Student_Faculty, s.Student_Faculty_Grade, "
    "s.Student_Gender, s.Student_ITI_Status, s.Student_Marital_Status, "
    "c.Company_Name"
)

# Run the query over the open connection and load the result into a DataFrame.
df = pd.read_sql(query, conn)

  df = pd.read_sql(query, conn)


In [352]:
# Preview the first rows of the raw query result.
df.head()

Unnamed: 0,student_id,student_faculty,student_faculty_grade,student_gender,student_iti_status,student_marital_status,total grade,company_name
0,12184,Faculty of Commerce,Very Good,Male,Graduated,Single,89.0,
1,28225,Faculty of Information Systems,Good,Female,Graduated,Single,85.0,
2,30140,Faculty of Computers Sciences,Good,Female,Graduated,Single,80.0,Vodafone Intelligent Solutions (VOIS)
3,38195,Faculty of Applied Arts,Very Good,Male,Graduated,Single,76.0,
4,22265,Faculty of Applied Arts,Pass,Male,Graduated,Married,63.0,Raya Holding


In [353]:
# (rows, columns) of the raw query result.
df.shape

(12900, 8)

In [354]:
# Number of rows per faculty, most frequent first.
df['student_faculty'].value_counts()

student_faculty
Faculty of Computers Sciences                 3280
Faculty of Engineering                        2596
Faculty of Information Systems                 860
Faculty of Business Administration             729
Faculty of Commerce                            725
Faculty of Agriculture                         701
Faculty of Science                             700
Faculty of Fine Arts                           695
Faculty of Applied Arts                        683
Faculty of Arts                                672
Faculty of Economics and Political Science     641
Faculty of Education                           618
Name: count, dtype: int64

In [355]:
# Count missing values in every column before any imputation.
df.isna().sum()

student_id                   0
student_faculty              0
student_faculty_grade        0
student_gender               0
student_iti_status           0
student_marital_status       0
total grade                  0
company_name              8951
dtype: int64

In [356]:
# Rows whose company is null (no match in the query's LEFT JOIN on Company)
# are labelled 'Unemployed' so the target column has no missing values.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['company_name']: the chained in-place form acts on
# an intermediate object, emits a FutureWarning on current pandas, and stops
# updating df altogether from pandas 3.0.
df['company_name'] = df['company_name'].fillna('Unemployed')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['company_name'].fillna('Unemployed', inplace=True)


In [357]:
#df['salary'].fillna(0, inplace=True) 

In [358]:
# Re-count missing values per column to confirm the imputation took effect.
df.isna().sum()

student_id                0
student_faculty           0
student_faculty_grade     0
student_gender            0
student_iti_status        0
student_marital_status    0
total grade               0
company_name              0
dtype: int64

In [359]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum() 

np.int64(0)

In [360]:
# student_id is only an identifier and is not used as a model feature, so it
# is removed here.
# Assigning the result back (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution, when the
# column is already gone, is a no-op instead of a KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

In [361]:
# Preview the frame after imputing company_name and dropping student_id.
df.head()

Unnamed: 0,student_faculty,student_faculty_grade,student_gender,student_iti_status,student_marital_status,total grade,company_name
0,Faculty of Commerce,Very Good,Male,Graduated,Single,89.0,Unemployed
1,Faculty of Information Systems,Good,Female,Graduated,Single,85.0,Unemployed
2,Faculty of Computers Sciences,Good,Female,Graduated,Single,80.0,Vodafone Intelligent Solutions (VOIS)
3,Faculty of Applied Arts,Very Good,Male,Graduated,Single,76.0,Unemployed
4,Faculty of Applied Arts,Pass,Male,Graduated,Married,63.0,Raya Holding


In [362]:
# Column dtypes, to see which features are still strings and need encoding.
print(df.dtypes)

student_faculty            object
student_faculty_grade      object
student_gender             object
student_iti_status         object
student_marital_status     object
total grade               float64
company_name               object
dtype: object


In [363]:
# Ordinal-encode the faculty grade so that a larger number means a better grade.
# Series.map turns any label that is not a key of this dict into NaN.
df['student_faculty_grade'] = df['student_faculty_grade'].map({
    'Pass': 0,
    'Good': 1,
    'Very Good': 2,
    'Excellent': 3,
})

In [364]:
# Binary-encode the ITI graduation status (1 = graduated, 0 = did not).
# Series.map turns any label that is not a key of this dict into NaN.
df['student_iti_status'] = df['student_iti_status'].map({
    'Failed to Graduate': 0,
    'Graduated': 1,
})

In [365]:
# (rows, columns) after the ordinal encodings; the mappings replace values in
# existing columns, so the column count is not expected to change here.
df.shape

(12900, 7)

In [None]:
# df = pd.get_dummies(df, columns=["student_faculty","student_gender","student_marital_status"], drop_first=False, dtype= 'uint8')

In [367]:
# Preview the frame after the ordinal encodings.
df.head()

Unnamed: 0,student_faculty_grade,student_iti_status,total grade,company_name,student_faculty_Faculty of Agriculture,student_faculty_Faculty of Applied Arts,student_faculty_Faculty of Arts,student_faculty_Faculty of Business Administration,student_faculty_Faculty of Commerce,student_faculty_Faculty of Computers Sciences,student_faculty_Faculty of Economics and Political Science,student_faculty_Faculty of Education,student_faculty_Faculty of Engineering,student_faculty_Faculty of Fine Arts,student_faculty_Faculty of Information Systems,student_faculty_Faculty of Science,student_gender_Female,student_gender_Male,student_marital_status_Married,student_marital_status_Single
0,2,1,89.0,Unemployed,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
1,1,1,85.0,Unemployed,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
2,1,1,80.0,Vodafone Intelligent Solutions (VOIS),0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
3,2,1,76.0,Unemployed,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,0,1,63.0,Raya Holding,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [368]:
# Feature matrix: the raw (not yet one-hot encoded) model inputs.
X = df.loc[:, [
    'student_faculty',          # categorical
    'student_faculty_grade',    # numeric (mapped)
    'student_gender',           # categorical
    'student_iti_status',       # numeric (mapped)
    'student_marital_status',   # categorical
    'total grade',              # numeric
]].copy()

# Target: the company name as a string label.
y = df['company_name'].astype(str).copy()

Unnamed: 0,student_faculty_grade,student_iti_status,total grade,student_faculty_Faculty of Agriculture,student_faculty_Faculty of Applied Arts,student_faculty_Faculty of Arts,student_faculty_Faculty of Business Administration,student_faculty_Faculty of Commerce,student_faculty_Faculty of Computers Sciences,student_faculty_Faculty of Economics and Political Science,student_faculty_Faculty of Education,student_faculty_Faculty of Engineering,student_faculty_Faculty of Fine Arts,student_faculty_Faculty of Information Systems,student_faculty_Faculty of Science,student_gender_Female,student_gender_Male,student_marital_status_Married,student_marital_status_Single
0,2,1,89.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1
1,1,1,85.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1
2,1,1,80.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
3,2,1,76.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,0,1,63.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [369]:
# Target labels as strings. This repeats the assignment of y made in the cell
# that builds X, so re-running it leaves y unchanged.
y = df['company_name'].astype(str).copy()


0                               Unemployed
1                               Unemployed
2    Vodafone Intelligent Solutions (VOIS)
3                               Unemployed
4                             Raya Holding
Name: company_name, dtype: object

In [370]:
# Persist the names (and order) of the raw input columns in X to disk.
joblib.dump(list(X.columns), 'model_columns.pkl')

['model_columns.pkl']

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible, and stratify=y keeps the company-label proportions the same in
# the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# _, _, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)

In [372]:
# print(X_train.columns.tolist())

['student_faculty_grade', 'student_iti_status', 'total grade', 'student_faculty_Faculty of Agriculture', 'student_faculty_Faculty of Applied Arts', 'student_faculty_Faculty of Arts', 'student_faculty_Faculty of Business Administration', 'student_faculty_Faculty of Commerce', 'student_faculty_Faculty of Computers Sciences', 'student_faculty_Faculty of Economics and Political Science', 'student_faculty_Faculty of Education', 'student_faculty_Faculty of Engineering', 'student_faculty_Faculty of Fine Arts', 'student_faculty_Faculty of Information Systems', 'student_faculty_Faculty of Science', 'student_gender_Female', 'student_gender_Male', 'student_marital_status_Married', 'student_marital_status_Single']


In [None]:
# Columns that are already numeric (two of them via the ordinal mappings) and
# are standardised, versus string columns that are one-hot encoded.
num_features = ['student_faculty_grade', 'student_iti_status', 'total grade']
cat_features = ['student_faculty', 'student_gender', 'student_marital_status']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        # handle_unknown='ignore': a category not seen during fit is encoded as
        # all zeros instead of raising at predict time.
        # sparse_output=False returns a dense array. The older `sparse=` keyword
        # was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
        # where passing it raises a TypeError.
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features)
    ],
    verbose_feature_names_out=False  # keep clean feature names (no num__/cat__ prefixes)
)
# X_train_scaled = preprocessor.fit_transform(X_train)
# X_test_scaled = preprocessor.transform(X_test)

In [None]:
# X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=preprocessor.get_feature_names_out())

In [375]:
# X_train_scaled_df.head()   

Unnamed: 0,student_faculty_grade,student_iti_status,total grade,student_faculty_Faculty of Agriculture,student_faculty_Faculty of Applied Arts,student_faculty_Faculty of Arts,student_faculty_Faculty of Business Administration,student_faculty_Faculty of Commerce,student_faculty_Faculty of Computers Sciences,student_faculty_Faculty of Economics and Political Science,student_faculty_Faculty of Education,student_faculty_Faculty of Engineering,student_faculty_Faculty of Fine Arts,student_faculty_Faculty of Information Systems,student_faculty_Faculty of Science,student_gender_Female,student_gender_Male,student_marital_status_Married,student_marital_status_Single
0,-0.430422,0.256487,-1.063021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.348851,0.256487,1.618151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.348851,0.256487,2.075912,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.348851,0.256487,1.356573,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1.348851,0.256487,0.57184,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [376]:
# print(X_train_scaled_df.columns.tolist())

['student_faculty_grade', 'student_iti_status', 'total grade', 'student_faculty_Faculty of Agriculture', 'student_faculty_Faculty of Applied Arts', 'student_faculty_Faculty of Arts', 'student_faculty_Faculty of Business Administration', 'student_faculty_Faculty of Commerce', 'student_faculty_Faculty of Computers Sciences', 'student_faculty_Faculty of Economics and Political Science', 'student_faculty_Faculty of Education', 'student_faculty_Faculty of Engineering', 'student_faculty_Faculty of Fine Arts', 'student_faculty_Faculty of Information Systems', 'student_faculty_Faculty of Science', 'student_gender_Female', 'student_gender_Male', 'student_marital_status_Married', 'student_marital_status_Single']


In [377]:
# Share of each company label, in percent, to show how imbalanced the target is.
class_percentage = df['company_name'].value_counts(normalize=True).mul(100)
print(class_percentage)

company_name
Unemployed                                 69.387597
Orange Egypt                                1.759690
Robusta                                     1.751938
Vodafone Intelligent Solutions (VOIS)       1.697674
Etisalat by e&                              1.348837
ArabyAds                                    1.333333
Fawry                                       1.317829
Paymob                                      1.248062
Talabat (Delivery Hero)                     1.248062
Raya Holding                                1.124031
Jumia                                       1.100775
Capgemini                                   0.945736
Orascom Construction                        0.922481
Accenture                                   0.868217
Instabug                                    0.860465
CIB (Commercial International Bank)         0.829457
IBM                                         0.790698
Siemens                                     0.697674
Schneider Electric               

In [None]:
# SMOTE over-sampler used to balance the company labels in the training data.
# (SMOTE is imported in the notebook's import cell with the other dependencies.)
# k_neighbors=2 is lower than the library default of 5.
# NOTE(review): presumably chosen so that rare companies with very few training
# rows can still be resampled; SMOTE needs more samples per class than
# k_neighbors — confirm against the smallest class in y_train.
smote = SMOTE(random_state=42, k_neighbors=2)

In [379]:
# X_resampled, y_resampled = smote.fit_resample(
#     X_train_scaled_df[
#         'student_faculty_grade', 'student_iti_status', 'total grade',
#         'student_faculty_Faculty of Agriculture', 'student_faculty_Faculty of Applied Arts', 
#         'student_faculty_Faculty of Arts', 'student_faculty_Faculty of Business Administration',
#         'student_faculty_Faculty of Commerce', 'student_faculty_Faculty of Computers Sciences',
#         'student_faculty_Faculty of Economics and Political Science', 'student_faculty_Faculty of Education',
#         'student_faculty_Faculty of Engineering', 'student_faculty_Faculty of Fine Arts', 'student_faculty_Faculty of Information Systems',
#         'student_faculty_Faculty of Science', 'student_gender_Female', 'student_gender_Male',
#         'student_marital_status_Married', 'student_marital_status_Single'],
#     y_train)
# df_smote = pd.DataFrame(
#     X_resampled,
#     columns=preprocessor.get_feature_names_out())
# df_smote['company_name'] = y_resampled

# Classifier: random forest with a fixed seed for reproducibility; n_jobs=-1
# lets it use all available CPU cores.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# ImbPipeline is imblearn.pipeline.Pipeline. It is required here instead of
# sklearn.pipeline.Pipeline because it accepts a sampler step: SMOTE resamples
# the data during fit only and is skipped by predict, so evaluation rows are
# never synthesised.
# NOTE(review): SMOTE interpolates the one-hot columns as if they were
# continuous; consider SMOTENC if fractional dummy values turn out to matter.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', rf)
])

# Fit on the raw training frame; scaling and encoding happen inside the pipeline.
pipeline.fit(X_train, y_train)


KeyError: ('student_faculty_grade', 'student_iti_status', 'total grade', 'student_faculty_Faculty of Agriculture', 'student_faculty_Faculty of Applied Arts', 'student_faculty_Faculty of Arts', 'student_faculty_Faculty of Business Administration', 'student_faculty_Faculty of Commerce', 'student_faculty_Faculty of Computers Sciences', 'student_faculty_Faculty of Economics and Political Science', 'student_faculty_Faculty of Education', 'student_faculty_Faculty of Engineering', 'student_faculty_Faculty of Fine Arts', 'student_faculty_Faculty of Information Systems', 'student_faculty_Faculty of Science', 'student_gender_Female', 'student_gender_Male', 'student_marital_status_Married', 'student_marital_status_Single')

In [None]:
# Predict on the held-out rows and report overall accuracy.
# NOTE(review): the target is heavily skewed towards 'Unemployed', so accuracy
# should be read together with the per-class report.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {acc:.4f}')

In [None]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0 (without a warning) for classes that were never
# predicted.
print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Class labels (company names) known to the fitted pipeline.
labels = pipeline.classes_
print("Classes:", labels)

In [None]:
# Persist the whole fitted pipeline (preprocessor + SMOTE step + classifier)
# as one artifact, so callers can pass raw feature columns to predict().
joblib.dump(pipeline, "model.pkl")
print("Saved pipeline to model.pkl")

In [None]:
# Names of the columns produced by the fitted preprocessor (scaled numeric
# columns plus one column per one-hot category), and how many there are.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
print("Produced feature names count:", len(feature_names))

In [None]:
# Save feature names if you want (not required for pipeline-based deployment)
# joblib.dump(feature_names.tolist(), "feature_names.pkl")

In [None]:
# class_percentage = df_smote['company_name'].value_counts(normalize=True) * 100
# print(class_percentage)

company_name
Unemployed                                 1.818182
Robusta                                    1.818182
Majid Al Futtaim                           1.818182
Paymob                                     1.818182
Orange Egypt                               1.818182
Microsoft                                  1.818182
Bosch                                      1.818182
Orascom Construction                       1.818182
Capgemini                                  1.818182
NAGWA                                      1.818182
Raya Holding                               1.818182
Etisalat by e&                             1.818182
Hassan Allam Holding                       1.818182
Deloitte                                   1.818182
Atos                                       1.818182
Vodafone Intelligent Solutions (VOIS)      1.818182
Oracle                                     1.818182
CIB (Commercial International Bank)        1.818182
EY (Ernst & Young)                         1.818182

In [None]:
# df_smote.head()

Unnamed: 0,num__student_faculty_grade,num__student_iti_status,num__total grade,cat__student_faculty_Faculty of Applied Arts,cat__student_faculty_Faculty of Arts,cat__student_faculty_Faculty of Business Administration,cat__student_faculty_Faculty of Commerce,cat__student_faculty_Faculty of Computers Sciences,cat__student_faculty_Faculty of Economics and Political Science,cat__student_faculty_Faculty of Education,cat__student_faculty_Faculty of Engineering,cat__student_faculty_Faculty of Fine Arts,cat__student_faculty_Faculty of Information Systems,cat__student_faculty_Faculty of Science,cat__student_gender_Male,cat__student_marital_status_Single,company_name
0,-0.42183,0.260115,0.048716,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,Unemployed
1,-0.42183,0.260115,-0.147109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,Robusta
2,-0.42183,0.260115,0.37509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Unemployed
3,1.354296,0.260115,0.440365,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Majid Al Futtaim
4,-0.42183,0.260115,-0.473483,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,Unemployed


In [None]:
# X1 = df_smote.drop(['company_name'], axis=1) # features
# y1 = df_smote['company_name'] # labels

In [None]:
# X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42) # 80-20 

In [None]:
# X1.head()

Unnamed: 0,num__student_faculty_grade,num__student_iti_status,num__total grade,cat__student_faculty_Faculty of Applied Arts,cat__student_faculty_Faculty of Arts,cat__student_faculty_Faculty of Business Administration,cat__student_faculty_Faculty of Commerce,cat__student_faculty_Faculty of Computers Sciences,cat__student_faculty_Faculty of Economics and Political Science,cat__student_faculty_Faculty of Education,cat__student_faculty_Faculty of Engineering,cat__student_faculty_Faculty of Fine Arts,cat__student_faculty_Faculty of Information Systems,cat__student_faculty_Faculty of Science,cat__student_gender_Male,cat__student_marital_status_Single
0,-0.42183,0.260115,0.048716,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,-0.42183,0.260115,-0.147109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,-0.42183,0.260115,0.37509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.354296,0.260115,0.440365,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.42183,0.260115,-0.473483,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [None]:
# # Create LogisticRegression and fit the model
# LR = LogisticRegression()
# LR.fit(X1_train, y1_train)

In [None]:
# LR_pred = LR.predict(X1_test)
# accuracy = accuracy_score(y1_test, LR_pred)
# print(f"Accuracy: {accuracy:.2f}")

In [None]:
# from sklearn.svm import SVC # SVC: Support Vector Classifier

# # Initialize and train the model
# svm = SVC(kernel='linear', probability=True)
# svm.fit(X1_train, y1_train)

In [None]:
# svm_pred = svm.predict(X1_test)

In [None]:

# # Initialize and train the model
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X1_train, y1_train)

In [None]:
# knn_pred = knn.predict(X1_test)

In [None]:
# accuracy = accuracy_score(y1_test, knn_pred)
# print(f"Accuracy: {accuracy:.2f}")

In [None]:
# print(classification_report(y1_test, knn_pred))

In [None]:
# from sklearn.tree import DecisionTreeClassifier
# tree = DecisionTreeClassifier(max_depth=3, random_state=42)
# tree.fit(X1_train, y1_train)

In [None]:
# tree_pred = tree.predict(X1_test)

In [None]:
# accuracy = accuracy_score(y1_test, tree_pred)
# print(f"Accuracy: {accuracy:.2f}")

In [None]:
# from sklearn.naive_bayes import GaussianNB

# # Train Naive Bayes model
# GNB = GaussianNB()
# GNB.fit(X1_train, y1_train)

In [None]:
# gnb_pred = GNB.predict(X1_test)

In [None]:
# accuracy = accuracy_score(y1_test, gnb_pred)
# print(f"Accuracy: {accuracy:.2f}")

In [None]:
# print(X1_train.columns.tolist())

['num__student_faculty_grade', 'num__student_iti_status', 'num__total grade', 'cat__student_faculty_Faculty of Applied Arts', 'cat__student_faculty_Faculty of Arts', 'cat__student_faculty_Faculty of Business Administration', 'cat__student_faculty_Faculty of Commerce', 'cat__student_faculty_Faculty of Computers Sciences', 'cat__student_faculty_Faculty of Economics and Political Science', 'cat__student_faculty_Faculty of Education', 'cat__student_faculty_Faculty of Engineering', 'cat__student_faculty_Faculty of Fine Arts', 'cat__student_faculty_Faculty of Information Systems', 'cat__student_faculty_Faculty of Science', 'cat__student_gender_Male', 'cat__student_marital_status_Single']


In [None]:
# # Initialize and train Random Forest
# # rf = RandomForestClassifier(n_estimators=100, random_state=42)
# # rf.fit(X1_train, y1_train)

# rf = RandomForestClassifier(n_estimators=100, random_state=42)
# model = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf)])
# model.fit(X1_train, y1_train)

ValueError: A given column is not a column of the dataframe

In [None]:
# rf_predictions = model.predict(X1_test)

In [None]:
# accuracy = accuracy_score(y1_test, rf_predictions)
# print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.86


In [None]:
# print("Random Forest Classification Report:")
# print(classification_report(y1_test, rf_predictions))

Random Forest Classification Report:
                                         precision    recall  f1-score   support

                              Accenture       0.78      0.77      0.78      1406
                           Amazon (AWS)       0.85      0.85      0.85      1412
                       Arab Contractors       0.88      0.87      0.87      1463
                               ArabyAds       0.71      0.71      0.71      1434
                                   Atos       0.82      0.82      0.82      1433
                            Banque Misr       0.86      0.86      0.86      1427
                                  Bosch       0.84      0.83      0.83      1407
    CIB (Commercial International Bank)       0.83      0.82      0.83      1393
                              Capgemini       0.78      0.79      0.78      1472
                                  Cisco       0.96      0.95      0.95      1458
                      Dell Technologies       0.95      0.96      0.95 

In [None]:
# joblib.dump(model, 'model.pkl')


['company_name_model.pkl']