In [7]:
import pandas as pd

# Read the student-success records from the relative CSV path and show the
# first five rows as plain text under a heading.
df = pd.read_csv("student_success_dataset.csv")
print("Sample Rows", df.head(), sep="\n")

Sample Rows
   StudyHours  Attendance  PastScore Internet  SleepHours Passed
0           2          60         40      Yes           5     No
1           5          80         60       No           6    Yes
2           8          90         75      Yes           8    Yes
3           3          70         50      Yes           5     No
4           7          85         70       No           7    Yes


In [8]:
# df.shape is (row_count, column_count); unpack it so both numbers are printed
# on one line separated by a space.
print(*df.shape)

20 6


In [9]:
# DataFrame.info() writes its summary (index range, per-column non-null counts
# and dtypes, memory usage) directly to stdout and returns None. Calling it
# bare avoids the stray "None" line that print(df.info()) appended.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   StudyHours  20 non-null     int64 
 1   Attendance  20 non-null     int64 
 2   PastScore   20 non-null     int64 
 3   Internet    20 non-null     object
 4   SleepHours  20 non-null     int64 
 5   Passed      20 non-null     object
dtypes: int64(4), object(2)
memory usage: 1.1+ KB
None


In [10]:
# Summary statistics for every column. include="all" reports count/mean/std/
# quantiles for numeric columns and unique/top/freq for object columns; cells
# that do not apply to a column's dtype show as NaN.
summary_stats = df.describe(include="all")
print(summary_stats)

        StudyHours  Attendance  PastScore Internet  SleepHours Passed
count    20.000000   20.000000   20.00000       20   20.000000     20
unique         NaN         NaN        NaN        2         NaN      2
top            NaN         NaN        NaN      Yes         NaN    Yes
freq           NaN         NaN        NaN       11         NaN     11
mean      5.100000   74.150000   59.10000      NaN    6.050000    NaN
std       2.845125   17.372922   19.41215      NaN    2.038446    NaN
min       1.000000   40.000000   25.00000      NaN    3.000000    NaN
25%       2.750000   60.000000   43.75000      NaN    4.750000    NaN
50%       5.000000   77.000000   62.50000      NaN    6.000000    NaN
75%       7.250000   88.500000   72.75000      NaN    7.250000    NaN
max      10.000000   99.000000   90.00000      NaN   10.000000    NaN


In [11]:
# Count missing values per column (a column of zeros means nothing to impute).
missing_per_column = df.isnull().sum()
print(missing_per_column)

StudyHours    0
Attendance    0
PastScore     0
Internet      0
SleepHours    0
Passed        0
dtype: int64


## Data Preprocessing

In [12]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Print a heading followed by the per-column count of missing values.
print("Missing Value in each column", df.isnull().sum(), sep="\n")

Missing Value in each column
StudyHours    0
Attendance    0
PastScore     0
Internet      0
SleepHours    0
Passed        0
dtype: int64


In [15]:
# Replace the two Yes/No text columns with integer codes, in place on df.
# LabelEncoder numbers the sorted class labels, so "No" -> 0 and "Yes" -> 1.
# The single encoder is refitted for each column, so after the loop `le`
# only remembers the classes of the last column ("Passed").
le = LabelEncoder()
for column in ("Internet", "Passed"):
    df[column] = le.fit_transform(df[column])

print("After Encoding", df.head(), sep="\n")

After Encoding
   StudyHours  Attendance  PastScore  Internet  SleepHours  Passed
0           2          60         40         1           5       0
1           5          80         60         0           6       1
2           8          90         75         1           8       1
3           3          70         50         1           5       0
4           7          85         70         0           7       1


In [16]:
# Show each column's dtype under a heading to confirm the frame is all numeric.
print("Data Types after Cleaning", df.dtypes, sep="\n")

Data Types after Cleaning
StudyHours    int64
Attendance    int64
PastScore     int64
Internet      int64
SleepHours    int64
Passed        int64
dtype: object


## Feature Scaling, Model Training, and Evaluation

In [18]:
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Label-encode "Internet" and "Passed" in place with a fresh encoder.
# NOTE(review): the preprocessing cell earlier in this notebook already
# converted both columns to 0/1 integers, so this pass leaves the values
# unchanged; it only matters if this cell is run without that earlier one.
le = LabelEncoder()
for name in ["Internet", "Passed"]:
    df[name] = le.fit_transform(df[name])

In [23]:
# Numeric columns to standardise (zero mean, unit variance).
features = [
    "StudyHours",
    "Attendance",
    "PastScore",
    "SleepHours",
]

# Work on a copy so the unscaled frame `df` stays available.
df_scaled = df.copy()

# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed later in the notebook, so test-row statistics influence the
# scaling of the training data — confirm whether that is acceptable here.
scaler = StandardScaler()
df_scaled[features] = scaler.fit_transform(df_scaled[features])

In [30]:
# Model inputs are the scaled numeric columns; the target is the encoded
# pass/fail flag.
# NOTE(review): the encoded "Internet" column is not part of `features`, so it
# is not used as a predictor — confirm this is intentional.
x, y = df_scaled[features], df_scaled["Passed"]

In [31]:
# Hold out 20% of the rows for evaluation with a fixed seed for
# reproducibility. The dataset has only 20 rows, so the test set is 4 rows;
# stratifying on the target keeps the pass/fail proportions the same in both
# splits and avoids a test set that contains a single class.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [32]:
# Train a logistic-regression classifier with default settings on the training
# split (fit() returns the fitted estimator), then predict the pass/fail label
# for each held-out row.
model = LogisticRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# Per-class precision / recall / F1 on the held-out rows.
print("classification report")
print(classification_report(y_test,y_pred))

# Rows of the confusion matrix are the true labels, columns the predicted
# ones. labels=[0, 1] pins the matrix to 2x2 (0 = "No"/Fail, 1 = "Yes"/Pass per
# the label encoding) even if the small test split happens to miss a class,
# so the two tick labels below always line up with the cells.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize = (6,4))
# The keyword is `yticklabels` (plural); the previous `yticklabel` spelling is
# not a heatmap parameter and was forwarded to the mesh artist, which raises
# on the unknown property.
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=["Fail", "Pass"],
    yticklabels=["Fail", "Pass"],
)
# Label the axes so the figure can be read without the surrounding code.
ax.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix");
