In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRFRegressor

## Apply ML Regression techniques on "50_Startups.csv" dataset

In [10]:
df=pd.read_csv('Assignment Datasets/50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [11]:
x=df[['R&D Spend','R&D Spend','Marketing Spend']]
x

Unnamed: 0,R&D Spend,R&D Spend.1,Marketing Spend
0,165349.2,165349.2,471784.1
1,162597.7,162597.7,443898.53
2,153441.51,153441.51,407934.54
3,144372.41,144372.41,383199.62
4,142107.34,142107.34,366168.42
5,131876.9,131876.9,362861.36
6,134615.46,134615.46,127716.82
7,130298.13,130298.13,323876.68
8,120542.52,120542.52,311613.29
9,123334.88,123334.88,304981.62


In [12]:
y=df['Profit']
y

0     192261.83
1     191792.06
2     191050.39
3     182901.99
4     166187.94
5     156991.12
6     156122.51
7     155752.60
8     152211.77
9     149759.96
10    146121.95
11    144259.40
12    141585.52
13    134307.35
14    132602.65
15    129917.04
16    126992.93
17    125370.37
18    124266.90
19    122776.86
20    118474.03
21    111313.02
22    110352.25
23    108733.99
24    108552.04
25    107404.34
26    105733.54
27    105008.31
28    103282.38
29    101004.64
30     99937.59
31     97483.56
32     97427.84
33     96778.92
34     96712.80
35     96479.51
36     90708.19
37     89949.14
38     81229.06
39     81005.76
40     78239.91
41     77798.83
42     71498.49
43     69758.98
44     65200.33
45     64926.08
46     49490.75
47     42559.73
48     35673.41
49     14681.40
Name: Profit, dtype: float64

In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [14]:
scaler=StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [15]:
models={
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'XGBR':XGBRFRegressor(),
    'RF':RandomForestRegressor()
}

In [16]:
for name,model in models.items():
    print(f'using: {name}')
    model.fit(x_train,y_train)
    print(f'Train Accuracy: {model.score(x_train,y_train)}')
    print(f'Test Accuracy: {model.score(x_test,y_test)}')
    print('-'*30,'\n')

using: LR
Train Accuracy: 0.9513173080273705
Test Accuracy: 0.9363797078962905
------------------------------ 

using: KNN
Train Accuracy: 0.9437404579406781
Test Accuracy: 0.8763621693210728
------------------------------ 

using: SVR
Train Accuracy: -0.0051619198567893765
Test Accuracy: 0.0005324247535462012
------------------------------ 

using: DT
Train Accuracy: 1.0
Test Accuracy: 0.9241117993215336
------------------------------ 

using: XGBR
Train Accuracy: 0.9957490254758548
Test Accuracy: 0.933808153674517
------------------------------ 

using: RF
Train Accuracy: 0.9917262835360013
Test Accuracy: 0.9469771665976415
------------------------------ 



### best model for this data is RandomForest and LinearRegression and XGBRFRegressor 

## Apply ML Regression techniques on "House prices.csv" dataset

In [19]:
df=pd.read_csv('Assignment Datasets/House prices.csv')
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [20]:
df=df.drop(['statezip','country'],axis=1)

In [21]:
df=pd.get_dummies(df,columns=['city','street'],drop_first=True)
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,street_Indian Trail,street_Nuthatch Trail,street_SE 170th Pl,street_SE 21st Ct,street_Schmitz Park to Alki Trail,street_Shangri-La Way NW,street_Sunrise Loop Trail,street_Tolt Pipeline Trail,street_Trossachs Blvd SE,street_Valley View Trail
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,...,0,0,0,0,0,0,0,0,0,0
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,...,0,0,0,0,0,0,0,0,0,0
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,...,0,0,0,0,0,0,0,0,0,0


In [22]:
x=df[df.columns.difference(['date','price'])]
x

Unnamed: 0,bathrooms,bedrooms,city_Auburn,city_Beaux Arts Village,city_Bellevue,city_Black Diamond,city_Bothell,city_Burien,city_Carnation,city_Clyde Hill,...,street_Schmitz Park to Alki Trail,street_Shangri-La Way NW,street_Sunrise Loop Trail,street_Tolt Pipeline Trail,street_Trossachs Blvd SE,street_Valley View Trail,view,waterfront,yr_built,yr_renovated
0,1.50,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1955,2005
1,2.50,5.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,0,1921,0
2,2.00,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1966,0
3,2.25,3.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1963,0
4,2.50,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1976,1992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,1.75,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1954,1979
4596,2.50,3.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1983,2009
4597,2.50,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2009,0
4598,2.00,4.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1974,0


In [23]:
y=df['price']
y

0       3.130000e+05
1       2.384000e+06
2       3.420000e+05
3       4.200000e+05
4       5.500000e+05
            ...     
4595    3.081667e+05
4596    5.343333e+05
4597    4.169042e+05
4598    2.034000e+05
4599    2.206000e+05
Name: price, Length: 4600, dtype: float64

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [25]:
scaler=StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [26]:
models={
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'XGBR':XGBRFRegressor(),
    'RF':RandomForestRegressor()
}

In [27]:
for name,model in models.items():
    print(f'using: {name}')
    model.fit(x_train,y_train)
    print(f'Train Accuracy: {model.score(x_train,y_train)}')
    print(f'Test Accuracy: {model.score(x_test,y_test)}')
    print('-'*30,'\n')

using: LR
Train Accuracy: 0.9980980011448445
Test Accuracy: -2.3937650535708113e+25
------------------------------ 

using: KNN
Train Accuracy: 0.3245604861542849
Test Accuracy: 0.009190296075324622
------------------------------ 

using: SVR
Train Accuracy: -0.023819579381186173
Test Accuracy: -0.04789978315084653
------------------------------ 

using: DT
Train Accuracy: 0.9999985770795974
Test Accuracy: 0.37902960630909377
------------------------------ 

using: XGBR
Train Accuracy: 0.8290074062647983
Test Accuracy: 0.5530374339542591
------------------------------ 

using: RF
Train Accuracy: 0.9046815013975181
Test Accuracy: 0.6247626056174296
------------------------------ 



### best model for this data is RandomForest

## Apply ML Regression techniques on "data.csv" dataset

In [35]:
df=pd.read_csv('Assignment Datasets/data.csv')
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [36]:
df=df.drop(['User_ID','Product_ID'],axis=1)

In [37]:
df['Product_Category_1'].fillna(df['Product_Category_1'].mode()[0],inplace=True)
df['Product_Category_2'].fillna(df['Product_Category_2'].mode()[0],inplace=True)
df['Product_Category_3'].fillna(df['Product_Category_3'].mode()[0],inplace=True)

In [38]:
df=pd.get_dummies(df,columns=['Gender','Age','City_Category','Stay_In_Current_City_Years'],drop_first=True)
df.head()

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,10,0,3,8.0,16.0,8370,0,0,0,0,0,0,0,0,0,0,1,0,0
1,10,0,1,6.0,14.0,15200,0,0,0,0,0,0,0,0,0,0,1,0,0
2,10,0,12,8.0,16.0,1422,0,0,0,0,0,0,0,0,0,0,1,0,0
3,10,0,12,14.0,16.0,1057,0,0,0,0,0,0,0,0,0,0,1,0,0
4,16,0,8,8.0,16.0,7969,1,0,0,0,0,0,1,0,1,0,0,0,1


In [39]:
x=df[df.columns.difference(['Purchase'])]
x

Unnamed: 0,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Gender_M,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,0,0,0,0,0,0,0,0,0,0,10,3,8.0,16.0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,10,1,6.0,14.0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,10,12,8.0,16.0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,10,12,14.0,16.0,0,1,0,0
4,0,0,0,0,0,1,0,1,1,0,16,8,8.0,16.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22889,0,0,0,0,1,0,1,0,1,1,13,20,8.0,16.0,1,0,0,0
22890,0,1,0,0,0,0,0,1,0,0,1,20,8.0,16.0,0,0,1,0
22891,0,1,0,0,0,0,1,0,0,1,15,20,8.0,16.0,0,0,0,1
22892,0,0,0,0,0,1,0,1,0,0,1,20,8.0,16.0,0,1,0,0


In [40]:
y=df['Purchase']
y

0         8370
1        15200
2         1422
3         1057
4         7969
         ...  
22889      368
22890      371
22891      137
22892      365
22893      490
Name: Purchase, Length: 22894, dtype: int64

In [41]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [42]:
scaler=StandardScaler()
scaler.fit(x_train)
x_train=scaler.transform(x_train)
x_test=scaler.transform(x_test)

In [43]:
models={
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'XGBR':XGBRFRegressor(),
    'RF':RandomForestRegressor()
}

In [44]:
for name,model in models.items():
    print(f'using: {name}')
    model.fit(x_train,y_train)
    print(f'Train Accuracy: {model.score(x_train,y_train)}')
    print(f'Test Accuracy: {model.score(x_test,y_test)}')
    print('-'*30,'\n')

using: LR
Train Accuracy: 0.42554378585123154
Test Accuracy: 0.4368029404332664
------------------------------ 

using: KNN
Train Accuracy: 0.5971303440228155
Test Accuracy: 0.38264628022257685
------------------------------ 

using: SVR
Train Accuracy: 0.03155524743337568
Test Accuracy: 0.029559380511341438
------------------------------ 

using: DT
Train Accuracy: 0.9592834347636732
Test Accuracy: 0.5837389687373487
------------------------------ 

using: XGBR
Train Accuracy: 0.6851894318463361
Test Accuracy: 0.6804849436798959
------------------------------ 

using: RF
Train Accuracy: 0.9349964584550101
Test Accuracy: 0.7259718553718291
------------------------------ 



### best model for this data is RandomForest

# Thank you
