In [1]:
import numpy as np
import pandas as pd

Creating the dataset

In [2]:
import random

from numpy.random import rand

# Seed both RNGs that the dataset-generation cells draw from (NumPy's global
# generator, used by `rand` / `np.random.randint`, and the stdlib `random`
# module, used by `choice`) so that "Restart Kernel -> Run All" rebuilds
# exactly the same dataset every time.
SEED = 0
np.random.seed(SEED)
random.seed(SEED)

# 50 rows x 2 columns of uniform values scaled from [0, 1) to [0, 10)
df  = pd.DataFrame(rand(50 , 2)*10, columns = 'Hours_studied Hours_slept'.split())
df.head(10)  # Shows the first 10 rows of the dataframe (head() defaults to 5)

Unnamed: 0,Hours_studied,Hours_slept
0,4.008298,9.437941
1,1.216888,0.494988
2,4.035245,0.014966
3,6.020175,2.264227
4,4.666995,3.905616
5,2.150499,9.151371
6,7.810582,6.425245
7,7.733035,0.487323
8,1.900401,1.740458
9,3.629801,1.835146


In [3]:
from random import choice

cities = ["Pune", "Delhi", "Bangalore", "Assam"]
stress_flags = ["yes", "no"]

# Draw one (IQ, city, stress flag) triple per student. Within each triple the
# draws happen left to right, i.e. in the same order a plain loop would make
# them, so both random streams are consumed identically.
students = [
    (np.random.randint(90, 140), choice(cities), choice(stress_flags))
    for _ in range(50)
]

# Unzip the triples into the three parallel lists used by the later cells.
IQ, Location, Stress = (list(column) for column in zip(*students))

In [4]:
# Quick sanity check: show the generated IQ values
print(IQ)

[93, 129, 116, 122, 97, 96, 123, 137, 117, 98, 129, 126, 126, 127, 134, 134, 122, 95, 97, 114, 95, 107, 99, 102, 131, 101, 125, 137, 97, 97, 102, 112, 118, 128, 119, 120, 95, 118, 100, 103, 137, 132, 99, 98, 109, 126, 93, 96, 116, 108]


In [5]:
df['IQ'] = IQ   # Adding column 'IQ' with values in list IQ (one value per row of df)
df.head()       # Preview the first 5 rows with the new column

Unnamed: 0,Hours_studied,Hours_slept,IQ
0,4.008298,9.437941,93
1,1.216888,0.494988,129
2,4.035245,0.014966,116
3,6.020175,2.264227,122
4,4.666995,3.905616,97


In [6]:
# Attach the two categorical lists as new columns; note the column is named
# 'Stressed' while the source list is named `Stress`.
df['Location'] = Location
df['Stressed'] = Stress
df.head()

Unnamed: 0,Hours_studied,Hours_slept,IQ,Location,Stressed
0,4.008298,9.437941,93,Pune,no
1,1.216888,0.494988,129,Pune,yes
2,4.035245,0.014966,116,Pune,yes
3,6.020175,2.264227,122,Delhi,no
4,4.666995,3.905616,97,Pune,yes


In [7]:
# Equation to calculate marks: a linear combination of the three numeric
# features plus an intercept. Location and Stressed do not enter the formula.
base_marks = (
    46 * df["Hours_studied"]
    + 63 * df["Hours_slept"]
    + 0.4 * df["IQ"]
    + 11.3
)

# NOTE(review): choice(...) is evaluated once, so every row receives the same
# offset -- it shifts the intercept rather than adding per-row noise. Confirm
# that this is the intended behaviour.
offset = choice([1250, -1400, 2319, 2831, -2327, -2910])
df["Marks"] = base_marks + offset

df.head(10)

Unnamed: 0,Hours_studied,Hours_slept,IQ,Location,Stressed,Marks
0,4.008298,9.437941,93,Pune,no,-2082.527995
1,1.216888,0.494988,129,Pune,yes,-2759.93888
2,4.035245,0.014966,116,Pune,yes,-2665.735894
3,6.020175,2.264227,122,Delhi,no,-2430.325675
4,4.666995,3.905616,97,Pune,yes,-2399.164449
5,2.150499,9.151371,96,Pune,yes,-2184.840694
6,7.810582,6.425245,123,Bangalore,yes,-2085.422803
7,7.733035,0.487323,137,Bangalore,no,-2457.479067
8,1.900401,1.740458,117,Pune,yes,-2654.832724
9,3.629801,1.835146,98,Pune,no,-2576.914933


In [8]:
# index=False keeps the row index out of the file, so the CSV holds only the data columns
df.to_csv('PredictMarks.csv', index = False)  # Storing dataframe as a .csv file

Importing the dataset

In [9]:
dataset = pd.read_csv("PredictMarks.csv")

# Distinguishing between features and labels: the last column is the label,
# every column before it is a feature.
label_column = dataset.columns[-1]
x = dataset.drop(columns=[label_column])
y = dataset[label_column]

Encoding categorical data

In [10]:
from sklearn.compose import ColumnTransformer
# Allows you to perform transformations on different columns separately, 
# then combines the individual outputs into a single feature space
from sklearn.preprocessing import OneHotEncoder
# Creates binary columns for each entry of categorical data

In [11]:
# One-hot encode the two categorical features and pass the numeric ones through
# unchanged. Selecting the columns by name (instead of positions 3 and 4) keeps
# the encoder pointed at the right features even if the column order changes;
# this requires the transformer to be fitted on a DataFrame.
# sparse_threshold=0 makes the transformer always return a dense array, so a
# later np.array(...) conversion never receives a SciPy sparse matrix when the
# number of categories (and hence the share of zeros) grows.
obj = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['Location', 'Stressed'])],
    remainder='passthrough',
    sparse_threshold=0,
)

print(type(obj))

<class 'sklearn.compose._column_transformer.ColumnTransformer'>


In [12]:
# Encode from the untouched feature frame rather than from `x` itself: `x` is
# rebound to the encoded array on this line, so encoding from `x` would make
# the cell non-idempotent (a second run would feed already-encoded data back
# into the transformer).
x = np.array(obj.fit_transform(dataset.iloc[:, :-1]))
print(x)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  1.00000000e+00 0.00000000e+00 4.00829809e+00 9.43794116e+00
  9.30000000e+01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 1.21688817e+00 4.94988318e-01
  1.29000000e+02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 4.03524479e+00 1.49658090e-02
  1.16000000e+02]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 6.02017486e+00 2.26422669e+00
  1.22000000e+02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 4.66699495e+00 3.90561560e+00
  9.70000000e+01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 2.15049911e+00 9.15137058e+00
  9.60000000e+01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 7.81058184e+00 6.42524496e+00
  1.23000000e+02]
 [0.00

Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split = train_test_split(x, y, test_size=0.3, random_state=0)
x_train, x_test, y_train, y_test = split

# Show the held-out features first, then the held-out labels.
print(x_test, y_test, sep="\n")

[[0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 5.58918877e+00 5.09915957e+00
  9.70000000e+01]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 6.61547622e+00 4.58338269e+00
  1.26000000e+02]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 2.53755649e+00 8.14519636e+00
  1.29000000e+02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 8.55644589e+00 5.04112101e+00
  1.32000000e+02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00
  0.00000000e+00 1.00000000e+00 4.03524479e+00 1.49658090e-02
  1.16000000e+02]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 1.00000000e+00 9.45324001e+00 8.56941658e+00
  1.37000000e+02]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00
  1.00000000e+00 0.00000000e+00 4.90193358e+00 6.66041975e-01
  1.00000000e+02]
 [0.00

Training the Multiple Linear Regression model on the Training set

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# Fit a linear regression model on the training split
regObj = LinearRegression()
regObj.fit(x_train, y_train)  # fit() returns the estimator, which the cell then displays

LinearRegression()

Predicting the Test set results

In [16]:
# Predict marks for the held-out test features
y_pred = regObj.predict(x_test)

# Optional: uncomment to limit printed NumPy floats to two decimals
#np.set_printoptions(precision=2)
y_pred

array([-2281.55026339, -2255.23498409, -2217.22503067, -2134.71286516,
       -2665.73589377, -1869.17771515, -2591.25041109, -2282.78929268,
       -2284.24349246, -2399.16444941, -2212.88478842, -2346.8854069 ,
       -2134.06803879, -2715.5815933 , -1890.44690879])

In [17]:
# Actual marks of the test rows, for comparison with the predictions in y_pred
print(y_test)

28   -2281.550263
11   -2255.234984
10   -2217.225031
41   -2134.712865
2    -2665.735894
27   -1869.177715
38   -2591.250411
31   -2282.789293
22   -2284.243492
4    -2399.164449
33   -2212.884788
35   -2346.885407
26   -2134.068039
34   -2715.581593
18   -1890.446909
Name: Marks, dtype: float64


In [18]:
# Collect the predicted and the actual marks of the test rows under named keys.
dic = dict(Prediction=y_pred, Actual=y_test)
dic

{'Prediction': array([-2281.55026339, -2255.23498409, -2217.22503067, -2134.71286516,
        -2665.73589377, -1869.17771515, -2591.25041109, -2282.78929268,
        -2284.24349246, -2399.16444941, -2212.88478842, -2346.8854069 ,
        -2134.06803879, -2715.5815933 , -1890.44690879]),
 'Actual': 28   -2281.550263
 11   -2255.234984
 10   -2217.225031
 41   -2134.712865
 2    -2665.735894
 27   -1869.177715
 38   -2591.250411
 31   -2282.789293
 22   -2284.243492
 4    -2399.164449
 33   -2212.884788
 35   -2346.885407
 26   -2134.068039
 34   -2715.581593
 18   -1890.446909
 Name: Marks, dtype: float64}

In [19]:
# Side-by-side table of predictions and actual marks; the row labels come from the y_test Series
tempDF = pd.DataFrame(data = dic)

In [20]:
# NOTE(review): predictions match the actual marks to the displayed precision.
# That is expected here, because Marks was built as an exact linear function of
# the numeric features plus a single constant offset (no per-row noise).
tempDF

Unnamed: 0,Prediction,Actual
28,-2281.550263,-2281.550263
11,-2255.234984,-2255.234984
10,-2217.225031,-2217.225031
41,-2134.712865,-2134.712865
2,-2665.735894,-2665.735894
27,-1869.177715,-1869.177715
38,-2591.250411,-2591.250411
31,-2282.789293,-2282.789293
22,-2284.243492,-2284.243492
4,-2399.164449,-2399.164449
