In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the first three columns of the header-less wine file and label them
# in the same expression, then preview the first three rows.
df = (
    pd.read_csv("wine_data.csv", header=None, usecols=[0, 1, 2])
    .set_axis(['Class label', 'Alcohol', 'Malix acid'], axis=1)
)
df.head(3)

Unnamed: 0,Class label,Alcohol,Malix acid
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36


In [3]:
# Keep only the two measurement columns as the input matrix and display it.
X = df.loc[:, ['Alcohol', 'Malix acid']]
X

Unnamed: 0,Alcohol,Malix acid
0,14.23,1.71
1,13.20,1.78
2,13.16,2.36
3,14.37,1.95
4,13.24,2.59
...,...,...
173,13.71,5.65
174,13.40,3.91
175,13.27,4.28
176,13.17,2.59


In [4]:
# Machine Learning -- training a model on data and then testing it on unseen data
# Types of ML -> Supervised and Unsupervised

# Supervised --> Both input and output are known (labeled data)
# Types of Supervised learning --> Regression and Classification

# Unsupervised --> The output is unknown (unlabeled data)

In [5]:
# Steps of ML

# Cleaning 
# Encoding
# ZScore

# 1. Separate input and output variables
# Syntax --> x = df.drop(target_col, axis = 1) # x denotes "Input".
# y = df[target_col]  # y denotes "Output".

# Scaling


# 2. Separate training and testing data
# (library) --> from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state = 42)

# import your algo
# from sklearn.linear_model import LinearRegression 

# Make the object
# lr = LinearRegression()

# Train the model
# lr.fit(x_train, y_train)


In [6]:
# Now working on data ------------:


In [7]:
# Load the training spreadsheet. This rebinds `df`, so the wine frame loaded
# earlier in the notebook is no longer reachable under this name.
df = pd.read_excel("Data_Train.xlsx")
# Preview the first three rows.
df.head(3)

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882


In [8]:
# Date_of_Journey is written day/month/year (e.g. "24/03/2019", "1/05/2019").
# Passing the format explicitly stops pandas from guessing, so ambiguous rows
# such as "1/05/2019" can never be read month-first.
df['Journey_Day'] = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y").dt.day

In [9]:
# Date_of_Journey is written day/month/year; parse it with an explicit format
# so the month is always taken from the middle field rather than inferred.
df['Journey_Month'] = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y").dt.month

In [10]:
# The raw date string is no longer needed once day and month are extracted.
df = df.drop(columns=["Date_of_Journey"])

In [11]:
# Preview the first three rows to check the new Journey_Day / Journey_Month columns.
df.head(3)

Unnamed: 0,Airline,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month
0,IndiGo,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662,1,5
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882,9,6


In [12]:
# Dep_Time holds 24-hour "HH:MM" strings (e.g. "22:20"); parse with an explicit
# format instead of relying on per-value inference.
df['Dep_Hour'] = pd.to_datetime(df['Dep_Time'], format="%H:%M").dt.hour

In [13]:
# Dep_Time holds 24-hour "HH:MM" strings (e.g. "05:50"); parse with an explicit
# format instead of relying on per-value inference.
df['Dep_Min'] = pd.to_datetime(df['Dep_Time'], format="%H:%M").dt.minute

In [14]:
# The raw departure string is redundant once hour and minute are extracted.
df = df.drop(columns=["Dep_Time"])

In [15]:
# Preview the first three rows to check the new Dep_Hour / Dep_Min columns.
df.head(3)

Unnamed: 0,Airline,Source,Destination,Route,Arrival_Time,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,01:10 22 Mar,2h 50m,non-stop,No info,3897,24,3,22,20
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,13:15,7h 25m,2 stops,No info,7662,1,5,5,50
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,04:25 10 Jun,19h,2 stops,No info,13882,9,6,9,25


In [16]:
# Arrival_Time mixes two layouts: "13:15" and "01:10 22 Mar" (clock plus a
# next-day date). Only the clock part is needed, so take the first
# whitespace-separated token and parse it once with an explicit format rather
# than letting pandas guess a format for each layout (and parsing twice).
arrival_clock = pd.to_datetime(df['Arrival_Time'].str.split().str[0], format="%H:%M")
df['Arrival_Hour'] = arrival_clock.dt.hour
df['Arrival_Min'] = arrival_clock.dt.minute

In [17]:
# The raw arrival string is redundant once hour and minute are extracted.
df = df.drop(columns=["Arrival_Time"])

In [18]:
# Preview the first three rows to check the new Arrival_Hour / Arrival_Min columns.
df.head(3)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,2h 50m,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,7h 25m,2 stops,No info,7662,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,19h,2 stops,No info,13882,9,6,9,25,4,25


In [19]:
# Convert Duration strings such as "2h 50m", "19h" or "5m" to total minutes.
# The pieces are pulled out with a regular expression instead of rewriting the
# text into an arithmetic expression and calling eval() on it: eval would
# execute whatever the data file contains, and it also corrupts any value that
# happens to contain another "h" or "m".
duration_parts = df['Duration'].str.extract(
    r'^\s*(?:(?P<hours>\d+)\s*h)?\s*(?:(?P<minutes>\d+)\s*m)?\s*$'
)

# A row where neither hours nor minutes matched is malformed; fail loudly
# rather than silently turning it into 0 minutes.
unparsed_rows = duration_parts.isna().all(axis=1)
if unparsed_rows.any():
    raise ValueError(
        "Unrecognised Duration value(s): "
        + repr(df.loc[unparsed_rows, 'Duration'].unique().tolist())
    )

# A missing part (e.g. no minutes in "19h") counts as zero.
df['Duration'] = (
    pd.to_numeric(duration_parts['hours']).fillna(0).astype(int) * 60
    + pd.to_numeric(duration_parts['minutes']).fillna(0).astype(int)
)

In [20]:
# Preview the first three rows; Duration should now be a number of minutes.
df.head(3)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min
0,IndiGo,Banglore,New Delhi,BLR → DEL,170,non-stop,No info,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,CCU → IXR → BBI → BLR,445,2 stops,No info,7662,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,DEL → LKO → BOM → COK,1140,2 stops,No info,13882,9,6,9,25,4,25


In [21]:
# Count missing values per column before cleaning.
df.isna().sum()

Airline            0
Source             0
Destination        0
Route              1
Duration           0
Total_Stops        1
Additional_Info    0
Price              0
Journey_Day        0
Journey_Month      0
Dep_Hour           0
Dep_Min            0
Arrival_Hour       0
Arrival_Min        0
dtype: int64

In [22]:
# Discard every row that has at least one missing value. The index is not
# reset, so the surviving rows keep their original labels.
df = df.dropna()

In [23]:
# Re-count missing values per column to confirm none remain after dropna.
df.isna().sum()

Airline            0
Source             0
Destination        0
Route              0
Duration           0
Total_Stops        0
Additional_Info    0
Price              0
Journey_Day        0
Journey_Month      0
Dep_Hour           0
Dep_Min            0
Arrival_Hour       0
Arrival_Min        0
dtype: int64

In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
# Sub-frame holding only the object-dtype (string) columns; iterating over it
# later yields the column labels.
categorical = df.select_dtypes(include=["object"])

In [26]:
# Create an (unfitted) label encoder instance.
le = LabelEncoder()

In [27]:
# Integer-encode every string column. Each column gets its own fitted encoder,
# stored by column name, so the code -> category mapping can be recovered later
# with label_encoders[col].inverse_transform(...) or .classes_. Re-fitting one
# shared encoder would discard every mapping except the last column's.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# which a linear model will treat as numeric magnitude.
label_encoders = {}
for column in categorical:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

In [28]:
# Preview the first three rows; the former string columns now hold integer codes.
df.head(3)

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min
0,3,0,5,18,170,4,8,3897,24,3,22,20,1,10
1,1,3,0,84,445,1,8,7662,1,5,5,50,13,15
2,4,2,1,118,1140,1,8,13882,9,6,9,25,4,25


In [29]:
from scipy.stats import zscore

# Absolute z-score of every cell, computed column by column.
z = np.abs(zscore(df))

# A row is kept only when all of its columns lie within three standard
# deviations of their column mean.
# NOTE(review): the score is taken over every column of df, which at this
# point includes the label-encoded columns and Price -- confirm that is intended.
within_three_sigma = (z < 3).all(axis = 1)
df1 = df[within_three_sigma]


In [30]:
# (rows, columns) before the z-score filter.
df.shape

(10682, 14)

In [31]:
# (rows, columns) after the z-score filter.
df1.shape

(10512, 14)

In [32]:
# Continue with an independent copy of the filtered frame under the name `df`.
df = df1.copy()

In [33]:
# Using steps of ML 
# Separate the regression target from the predictors.
y = df['Price']  # y is output
X = df.drop(columns="Price")  # X is input

In [34]:
# Display the input feature frame.
X

Unnamed: 0,Airline,Source,Destination,Route,Duration,Total_Stops,Additional_Info,Journey_Day,Journey_Month,Dep_Hour,Dep_Min,Arrival_Hour,Arrival_Min
0,3,0,5,18,170,4,8,24,3,22,20,1,10
1,1,3,0,84,445,1,8,1,5,5,50,13,15
2,4,2,1,118,1140,1,8,9,6,9,25,4,25
3,3,3,0,91,325,0,8,12,5,18,5,23,30
4,3,0,5,29,285,0,8,1,3,16,50,21,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,0,3,0,64,150,4,8,9,4,19,55,22,25
10679,1,3,0,64,155,4,8,27,4,20,45,23,20
10680,4,0,2,18,180,4,8,27,4,8,20,11,20
10681,10,0,5,18,160,4,8,1,3,11,30,14,10


In [35]:
# Display the target series (Price).
y

0         3897
1         7662
2        13882
3         6218
4        13302
         ...  
10678     4107
10679     4145
10680     7229
10681    12648
10682    11753
Name: Price, Length: 10512, dtype: int64

In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
# Create an (unfitted) standard scaler instance.
sc = StandardScaler()

In [38]:
# Learn each column's mean and standard deviation from X, then standardise X.
# NOTE(review): this fits on every row of X, including rows that later land in
# the test split.
X_sc = sc.fit(X).transform(X)

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Split the raw (unscaled) features first, then fit the scaler on the training
# rows only and reuse those statistics for the test rows. Scaling the whole
# matrix before splitting lets test-set means and variances leak into the
# inputs the model is trained on. With the same row count, test_size and
# random_state, the rows assigned to each split are unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [41]:
from sklearn.linear_model import LinearRegression

In [42]:
# Create the linear regression estimator with default settings.
lr = LinearRegression()

In [43]:
# Fit the linear model on the training split.
lr.fit(X_train, y_train)

In [44]:
# Predict prices for the held-out test split.
y_pred = lr.predict(X_test)

In [45]:
# Put the true and predicted prices side by side as two columns of one frame.
actual_and_predicted = np.column_stack((y_test, y_pred))
cal = pd.DataFrame(actual_and_predicted, columns = ['Actual_Price', 'Predicted_Price'])

In [46]:
# Display the actual-vs-predicted comparison table.
cal

Unnamed: 0,Actual_Price,Predicted_Price
0,5069.0,9656.481337
1,10953.0,11852.320160
2,10919.0,10081.614948
3,13941.0,11932.882883
4,12681.0,11128.300192
...,...,...
2098,8040.0,4618.718665
2099,10262.0,11345.984798
2100,10651.0,10912.596574
2101,10844.0,11039.635689


In [47]:
##########################################

In [48]:
# Next step: evaluate the model with error metrics

In [49]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [50]:
# Compute each metric once on the test split, then report them.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

print("Mean absolute error: ", mae)
print("Mean squared error: ", mse)
print("Root Mean Squared Error: ", rmse)
print('\n')
print('R2 Score: ')
print(r2_score(y_test, y_pred))

Mean absolute error:  2306.1845101194467
Mean squared error:  8606315.78928081
Root Mean Squared Error:  2933.6522952253235


R2 Score: 
0.47914430231873595


In [54]:
# These are only examples right now for accuracy check. ==> DT and RF are both examples for accuracy check.
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor on the same split for comparison.
# random_state is fixed (same seed as the train/test split) so the tree -- and
# therefore the scores printed below -- are identical on every run; an unseeded
# tree can break ties between equally good splits differently each time.
dtr = DecisionTreeRegressor(random_state = 42)
dtr.fit(X_train, y_train)

# NOTE: this rebinds y_pred, replacing the previous model's predictions.
y_pred = dtr.predict(X_test)
print('R2 Score: ', r2_score(y_test, y_pred))
print('Mean absolute error: ', mean_absolute_error(y_test, y_pred))
print('Mean squared error: ', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(y_test, y_pred)))


R2 Score:  0.8927248590056651
Mean absolute error:  602.5058250118877
Mean squared error:  1772551.8677187352
Root Mean Squared Error:  1331.3721747575826


In [55]:
from sklearn.ensemble import RandomForestRegressor
rfg = RandomForestRegressor()
rfg.fit(X_train, y_train)
y_pred = rfg.predict(X_test)
print('R2 score', r2_score(y_test, y_pred))
print('Mean absolute error: ', mean_absolute_error(y_test, y_pred))
print('Mean squared error: ', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: ', np.sqrt(mean_squared_error(y_test, y_pred)))

R2 score 0.92917527583144
Mean absolute error:  567.763823004174
Mean squared error:  1170266.4377040993
Root Mean Squared Error:  1081.788536500595
