In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

In [12]:
df = pd.read_csv('bi.csv', encoding='latin-1')


print('\nShape of the data: ')
print(f'Rows: {df.shape[0]} columns: {df.shape[1]}')

print("\nDataset info: ")
print(df.info())

print('\nSummary staticts: ')
print(df.describe(include='all'))

print('\nMissing Values: ')
print(df.isnull().sum())

print('First five rows of data: ')
print(df.head())


Shape of the data: 
Rows: 77 columns: 11

Dataset info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fNAME          77 non-null     object 
 1   lNAME          77 non-null     object 
 2   Age            77 non-null     int64  
 3   gender         77 non-null     object 
 4   country        77 non-null     object 
 5   residence      77 non-null     object 
 6   entryEXAM      77 non-null     int64  
 7   prevEducation  77 non-null     object 
 8   studyHOURS     77 non-null     int64  
 9   Python         75 non-null     float64
 10  DB             77 non-null     int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 6.7+ KB
None

Summary staticts: 
         fNAME  lNAME        Age  gender country residence  entryEXAM  \
count       77     77  77.000000      77      77        77  77.000000   
unique      71     66        NaN  

#### Processing (Clean Data)

In [53]:
data = df.copy()
data = data.drop(['fNAME','lNAME'], axis=1)
data['gender'] = data['gender'].replace({
  'Male': 'M',
  'Female': 'F',
  'male': 'M',
  'female': 'F'
})
data['residence'] = data['residence'].replace({
    'BI-Residence': 'BI Residence',
    'BIResidence': 'BI Residence',
    'BI_Residence': 'BI Residence'
})
data['prevEducation'] = data['prevEducation'].replace({
    'diploma': 'Diploma',
    'DIPLOMA': 'Diploma',
    'Diplomaaa': 'Diploma',
    'High School': 'HighSchool',
    'Barrrchelors': 'Bachelors'
})
data = data.dropna(subset='Python')

#### Encoding

In [54]:
le = LabelEncoder()
data['gender'] = le.fit_transform(data['gender'])
data = pd.get_dummies(data, columns=['country', 'residence', 'prevEducation'], drop_first=True, dtype=int)
data.head()

Unnamed: 0,Age,gender,entryEXAM,studyHOURS,Python,DB,country_France,country_Germany,country_Italy,country_Kenya,...,country_Spain,country_UK,country_Uganda,country_norway,residence_Private,residence_Sognsvann,prevEducation_Diploma,prevEducation_Doctorate,prevEducation_HighSchool,prevEducation_Masters
0,44,0,72,158,59.0,55,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,60,1,79,150,60.0,75,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
2,25,1,55,130,74.0,50,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0
4,23,0,65,122,91.0,80,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,25,0,66,130,88.0,59,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


#### Feature Scaling

In [70]:
features = ['Age','entryEXAM','studyHOURS']
scaled_data = data.copy()
scaler = StandardScaler()
scaled_data[features] = scaler.fit_transform(scaled_data[features])

#### Split the Data

In [71]:
feature_cols = [col for col in scaled_data.columns if col not in ['Python','DB']]
X = scaled_data[feature_cols]
y = scaled_data[['Python','DB']]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

#### Train the model

In [72]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### Model Prediction

In [73]:
y_pred = model.predict(X_test)
print(y_pred)

[[62.53 57.78]
 [80.87 68.43]
 [76.17 55.59]
 [77.88 57.66]
 [87.24 89.22]
 [83.24 73.74]
 [81.98 78.96]
 [74.48 56.25]
 [72.32 65.2 ]
 [84.7  78.28]
 [81.71 60.71]
 [87.58 84.42]
 [81.77 72.07]
 [87.09 80.96]
 [76.3  56.3 ]]


Evaluate the Model

In [74]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute ERROR (MAE): ", round(mae, 2))
print("Mean Squared ERROR (MSE): ", round(mse, 2))
print("Root Mean Squared ERROR (RMSE): ", round(rmse, 2))
print("r^2 Score (Model accurecy): ", round(r2, 4))

Mean Absolute ERROR (MAE):  7.1
Mean Squared ERROR (MSE):  111.0
Root Mean Squared ERROR (RMSE):  10.54
r^2 Score (Model accurecy):  0.1777
