# Simple Linear regression with evaluation metrics

In [11]:
import pandas as pd
import numpy as np
import random

In [12]:


data = {
    "Year of Experiences": [3, 5, 2, 7, 4, 6, 1, 9, 8, 10, 2, 4, 6, 8, 5, 3, 7, 9, 4, 6],
    "Salary": [45000, 60000, 38000, 75000, 52000, 68000, 30000, 90000, 82000, 100000, 36000, 50000, 72000, 85000, 59000, 42000, 77000, 92000, 54000, 69000]
}

dataset = pd.DataFrame(data)
dataset


Unnamed: 0,Year of Experiences,Salary
0,3,45000
1,5,60000
2,2,38000
3,7,75000
4,4,52000
5,6,68000
6,1,30000
7,9,90000
8,8,82000
9,10,100000


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score





In [14]:
# Split dataset into features (X) and target variable (y)
X = dataset[['Year of Experiences']]
y = dataset['Salary']

# Split the dataset into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create linear regression model
model = LinearRegression()

# Train the model using the training sets
model.fit(X, y)




In [15]:
# Make predictions using the your predefined input data
experience = np.array([[11]])
salary = model.predict(experience)
print(salary)


[107334.3836156]




In [16]:
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Assign actual and predicted values
y_test = [109000]  # Wrap the integer in a list
y_pred = salary


# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)



Mean Absolute Error (MAE): 1665.6163844033144
Mean Squared Error (MSE): 2774277.9399927696
Root Mean Squared Error (RMSE): 1665.6163844033144


# Exercise: train the regression model using train test method and them find the errors using the different method provided above.

In [17]:
# do coding here
data = {
    "Year of Experiences": [3, 5, 2, 7, 4, 6, 1, 9, 8, 10, 2, 4, 6, 8, 5, 3, 7, 9, 4, 6],
    "Salary": [45000, 60000, 38000, 75000, 52000, 68000, 30000, 90000, 82000, 100000, 36000, 50000, 72000, 85000, 59000, 42000, 77000, 92000, 54000, 69000]
}

dataset = pd.DataFrame(data)

X = dataset[['Year of Experiences']]
y = dataset['Salary']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create linear regression model
model_linear = LinearRegression()

# Train the model using the training sets
model_linear.fit(X_train, y_train)

# testing the model
y_predict=model_linear.predict(X_test)

# Calculate MAE
mae_linear = mean_absolute_error(y_test, y_predict)
print("Mean Absolute Error (MAE):", mae_linear)


Mean Absolute Error (MAE): 973.7890864500296


# Exercise: predict the Co2 emission on the FuelConsumption dataset  

In [18]:
# do coding here
df=pd.read_csv('FuelConsumptionCo2.csv')
df.head(2)

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   MODELYEAR                 1067 non-null   int64  
 1   MAKE                      1067 non-null   object 
 2   MODEL                     1067 non-null   object 
 3   VEHICLECLASS              1067 non-null   object 
 4   ENGINESIZE                1067 non-null   float64
 5   CYLINDERS                 1067 non-null   int64  
 6   TRANSMISSION              1067 non-null   object 
 7   FUELTYPE                  1067 non-null   object 
 8   FUELCONSUMPTION_CITY      1067 non-null   float64
 9   FUELCONSUMPTION_HWY       1067 non-null   float64
 10  FUELCONSUMPTION_COMB      1067 non-null   float64
 11  FUELCONSUMPTION_COMB_MPG  1067 non-null   int64  
 12  CO2EMISSIONS              1067 non-null   int64  
dtypes: float64(4), int64(4), object(5)
memory usage: 108.5+ KB


In [20]:
df.describe()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
count,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0
mean,2014.0,3.346298,5.794752,13.296532,9.474602,11.580881,26.441425,256.228679
std,0.0,1.415895,1.797447,4.101253,2.79451,3.485595,7.468702,63.372304
min,2014.0,1.0,3.0,4.6,4.9,4.7,11.0,108.0
25%,2014.0,2.0,4.0,10.25,7.5,9.0,21.0,207.0
50%,2014.0,3.4,6.0,12.6,8.8,10.9,26.0,251.0
75%,2014.0,4.3,8.0,15.55,10.85,13.35,31.0,294.0
max,2014.0,8.4,12.0,30.2,20.5,25.8,60.0,488.0


In [21]:
X=df[['FUELCONSUMPTION_COMB_MPG']]
Y=df['CO2EMISSIONS']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Create linear regression model
model_linear = LinearRegression()

# Train the model using the training sets
model_linear.fit(X_train, y_train)

# testing the model
y_predict=model_linear.predict(X_test)

# Calculate MAE
mae_linear = mean_absolute_error(y_test, y_predict)
print("Mean Absolute Error (MAE):", mae_linear)


Mean Absolute Error (MAE): 17.73061220485377


# Multiple Linear Regression

In [23]:
import pandas as pd
import numpy as np

# Generate synthetic data
np.random.seed(0)
num_samples = 100
years_of_experience = np.random.randint(1, 20, size=num_samples)
age = np.random.randint(22, 65, size=num_samples)
education_level = np.random.randint(1, 5, size=num_samples)  # Assume 1=High School, 2=Bachelor's, 3=Master's, 4=PhD
salary = 30000 + (years_of_experience * 2000) + (age * 100) + (education_level * 5000) + np.random.normal(scale=10000, size=num_samples)

# Create DataFrame
data = {
    "Years of Experience": years_of_experience,
    "Age": age,
    "Education Level": education_level,
    "Salary": salary
}
dataset = pd.DataFrame(data)

print(dataset.head())


   Years of Experience  Age  Education Level        Salary
0                   13   45                4  82454.993353
1                   16   24                2  69683.956665
2                    1   56                4  75723.546515
3                    4   57                2  30758.625212
4                    4   52                4  69712.093458


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming you have loaded your dataset into a pandas DataFrame named 'dataset'
# Splitting the dataset into features (X) and target variable (y)
X = dataset[['Years of Experience', 'Age', 'Education Level']]  # Independent variables
y = dataset['Salary']  # Dependent variable

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the MLR model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 70165147.63207223


# **Exercise**: Predict the student performance using regression model and then evaluate the model.

In [25]:
import pandas as pd
df=pd.read_csv('Student_Performance.csv')
df.head(10)

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
5,3,78,No,9,6,61.0
6,7,73,Yes,5,6,63.0
7,8,45,Yes,4,6,42.0
8,5,77,No,8,2,61.0
9,4,89,No,4,0,69.0


In [26]:
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [28]:
from sklearn.preprocessing import LabelEncoder

In [29]:
encoder=LabelEncoder()

In [30]:
df['Extracurricular Activities']=encoder.fit_transform(df['Extracurricular Activities'])

In [31]:
df


Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,1,4,2,23.0
9996,7,64,1,8,5,58.0
9997,6,83,1,8,5,74.0
9998,9,97,1,7,0,95.0


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int32  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int32(1), int64(4)
memory usage: 429.8 KB


In [33]:
X=df[['Previous Scores','Sample Question Papers Practiced','Sleep Hours','Hours Studied','Extracurricular Activities']]
Y=df['Performance Index']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Create linear regression model
model_linear = LinearRegression()

# Train the model using the training sets
model_linear.fit(X_train, y_train)

# testing the model
y_predict=model_linear.predict(X_test)

# Calculate MAE
mae_linear = mean_absolute_error(y_test, y_predict)
print("Mean Absolute Error (MAE):", mae_linear)

Mean Absolute Error (MAE): 1.611121346312304
