In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np


In [15]:
df = pd.read_csv('erasmus.csv')
df.head()

Unnamed: 0,INDEX,COUNTRIES,UNIVERSITIES,FACULTIES,DEPARTMENTS,EXAM SCORE,GRANT
0,1,ITALIA,UNIVERSITA DEGLI STUDI DI ROMA LA SAPIENZA,FACULTY OF ARTS AND SCIENCES,ENGLISH LANGUAGE AND LITERATURE,98.5,1
1,2,ITALIA,ALMA MATER STUDIORUM - UNIVERSITA DI BOLOGNA,FACULTY OF ARTS AND SCIENCES,SOCIOLOGY,97.1,1
2,3,GERMAN,UNIVERSITAET BIELEFELD,FACULTY OF ARTS AND SCIENCES,PSYCHOLOGY,96.8,1
3,4,GERMAN,HOCHSCHULE FUR ANGEWANDTE WISSENSCHAFTEN HAMBURG,FACULTY OF HEALTH SCIENCES,NUTRITION AND DIETETICS,96.5,1
4,5,ITALIA,UNIVERSITA DEGLI STUDI DI ROMA LA SAPIENZA,FACULTY OF ARTS AND SCIENCES,ENGLISH LANGUAGE AND LITERATURE,96.32,1


In [16]:
df.describe()

Unnamed: 0,INDEX,EXAM SCORE,GRANT
count,341.0,341.0,341.0
mean,171.0,79.138944,0.434018
std,98.582453,8.565175,0.496356
min,1.0,51.4,0.0
25%,86.0,73.57,0.0
50%,171.0,79.3,0.0
75%,256.0,85.28,1.0
max,341.0,98.5,1.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341 entries, 0 to 340
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   INDEX         341 non-null    int64  
 1   COUNTRIES     341 non-null    object 
 2   UNIVERSITIES  341 non-null    object 
 3   FACULTIES     341 non-null    object 
 4   DEPARTMENTS   339 non-null    object 
 5   EXAM SCORE    341 non-null    float64
 6   GRANT         341 non-null    int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 18.8+ KB


In [18]:
# Drop the rows with missing values
df = df.dropna()


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 339 entries, 0 to 340
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   INDEX         339 non-null    int64  
 1   COUNTRIES     339 non-null    object 
 2   UNIVERSITIES  339 non-null    object 
 3   FACULTIES     339 non-null    object 
 4   DEPARTMENTS   339 non-null    object 
 5   EXAM SCORE    339 non-null    float64
 6   GRANT         339 non-null    int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 21.2+ KB


In [20]:
#Whether a student can receive a grant depends on their faculty (because each faculty has its own criteria and contingent) and exam score. 
#So, we may need to examine FACULTIES column.
df['FACULTIES'].value_counts()

FACULTIES
FACULTY OF ECONOMICS AND ADMINISTRATIVE SCIENCES    73
FACULTY OF ENGINEERING                              69
FACULTY OF ARTS AND SCIENCES                        53
FACULTY OF COMMUNICATION                            32
FACULTY OF LAW                                      23
 FACULTY OF COMPUTER AND INFORMATION SCIENCES       22
FACULTY OF ARCHITECTURE                             18
FACULTY OF HEALTH SCIENCES                          16
FACULTY OF EDUCATION                                16
FACULTY OF FINE ARTS                                 8
FACULTY OF PHARMACY                                  5
FACULTY OF MEDICINE                                  2
GRADUATE SCHOOL OF SOCIAL SCIENCES                   2
Name: count, dtype: int64

In [21]:
#So, there are two faculties that they have fewer students. 
#One of them is GRADUATE SCHOOL OF SOCIAL SCIENCES and other is FACULTY OF MEDICINE. We may need to examine these separately.
#For to do the this, we can use concat method in pandas library.
graduate_school_of_social_sciences = df[df['FACULTIES'] == 'GRADUATE SCHOOL OF SOCIAL SCIENCES']
faculty_of_medicine = df[df['FACULTIES'] == 'FACULTY OF MEDICINE']

concat = pd.concat([graduate_school_of_social_sciences, faculty_of_medicine])
concat

Unnamed: 0,INDEX,COUNTRIES,UNIVERSITIES,FACULTIES,DEPARTMENTS,EXAM SCORE,GRANT
158,159,POLAND,WYZSZA SZKOLA MENEDZERSKA W WARSZAWİE,GRADUATE SCHOOL OF SOCIAL SCIENCES,BUSINESS ADMINISTRATION,79.83,1
228,229,BELGIUM,KATHOLIEKE UNIVERSITEIT LEUVEN,GRADUATE SCHOOL OF SOCIAL SCIENCES,PRICATE LAW,75.77,1
49,50,SLOVENIA,UNIVERZA V LJUBLJANI,FACULTY OF MEDICINE,MEDICINE,88.55,1
340,341,GERMAN,JULIUS-MAXIMILIANS-UNIVERSITAT WURZBURG,FACULTY OF MEDICINE,MEDICINE,51.4,0


In [22]:
#Unlike the FACULTY OF MEDICINE, all students at the GRADUATE SCHOOL OF SOCIAL SCIENCES received grants.
#This can cause problems when making predictions. So, if all students in a faculty receive grant, we need to remove that faculty from the dataset.
#We can use a loop that provides this functionality.

faculties = df['FACULTIES'].unique() #to store all faculties

for faculty in faculties:
    faculty_data = df[df['FACULTIES'] == faculty]
    if len(faculty_data['GRANT'].unique()) == 1:
        df = df[df['FACULTIES'] != faculty]

df['FACULTIES'].value_counts()
#Now, there is no GRADUATE SCHOOL OF SOCIAL SCIENCES

FACULTIES
FACULTY OF ECONOMICS AND ADMINISTRATIVE SCIENCES    73
FACULTY OF ENGINEERING                              69
FACULTY OF ARTS AND SCIENCES                        53
FACULTY OF COMMUNICATION                            32
FACULTY OF LAW                                      23
 FACULTY OF COMPUTER AND INFORMATION SCIENCES       22
FACULTY OF ARCHITECTURE                             18
FACULTY OF HEALTH SCIENCES                          16
FACULTY OF EDUCATION                                16
FACULTY OF FINE ARTS                                 8
FACULTY OF MEDICINE                                  2
Name: count, dtype: int64

In [23]:
#to define the input and output variables
x = df[['EXAM SCORE', 'FACULTIES']]
y = df['GRANT']

#FACULTIES column is a categorical column. So, we need to convert it to numerical values.
encoder = OneHotEncoder()
x_encoded = encoder.fit_transform(x[['FACULTIES']])

#Merge the encoded FACULTIES column with the EXAM SCORE column.
x_final = np.hstack((x[['EXAM SCORE']].values, x_encoded.toarray()))

In [24]:
#Split the data into training and test sets, using 20% of data for training.
x_train, x_test, y_train, y_test = train_test_split(x_final, y, test_size=0.2, random_state=37)

#Create a logistic regression model and train it.
model = LogisticRegression()
model.fit(x_train, y_train)

#Make predictions on the test set.
y_pred = model.predict(x_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
#results
score = model.score(x_test, y_test)
print('Accuracy:', score)
print(61 * '-')
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 0.9850746268656716
-------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        42
           1       1.00      0.96      0.98        25

    accuracy                           0.99        67
   macro avg       0.99      0.98      0.98        67
weighted avg       0.99      0.99      0.99        67

