In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from datetime import datetime
from sklearn.impute import KNNImputer

In [3]:
startup_df = pd.read_csv('Assignment Datasets/50_Startups.csv')

In [4]:
startup_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [5]:
startup_df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [7]:
startup_df = pd.get_dummies(startup_df, columns=['State'])

In [9]:
x = startup_df.drop(['Profit'], axis=1)
y = startup_df['Profit']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=111)

In [11]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [12]:
def mlmodel(model):
    reg = model
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)
    print(f'Train Accuracy : {reg.score(x_train, y_train)}')
    print(f'Test Accuracy : {reg.score(x_test, y_test)}')

In [13]:
models = {
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(),
    'SVR':SVR(),
    'DT':DecisionTreeRegressor(),
    'RF':RandomForestRegressor(),
    'XGBR':XGBRegressor()
}
for name, model in models.items():
    print(f'using : {name}')
    mlmodel(model)

using : LR
Train Accuracy : 0.9545588933312086
Test Accuracy : 0.9216934570477677
using : KNN
Train Accuracy : 0.6833643877713039
Test Accuracy : 0.47678712017477254
using : SVR
Train Accuracy : -0.01378866887125585
Test Accuracy : -0.024150876054919834
using : DT
Train Accuracy : 1.0
Test Accuracy : 0.8798796972219672
using : RF
Train Accuracy : 0.9862533598579946
Test Accuracy : 0.9347233333574101
using : XGBR
Train Accuracy : 0.9999999999998358
Test Accuracy : 0.8856064986468026


In [23]:
houseprz_df = pd.read_csv('Assignment Datasets/House prices.csv')

In [24]:
houseprz_df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [25]:
houseprz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [26]:
houseprz_df['date'] = pd.to_datetime(houseprz_df['date'])

In [27]:
houseprz_df['Year'] = houseprz_df['date'].dt.year
houseprz_df['month'] = houseprz_df['date'].dt.month
houseprz_df['day'] = houseprz_df['date'].dt.day

In [28]:
houseprz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           4600 non-null   datetime64[ns]
 1   price          4600 non-null   float64       
 2   bedrooms       4600 non-null   float64       
 3   bathrooms      4600 non-null   float64       
 4   sqft_living    4600 non-null   int64         
 5   sqft_lot       4600 non-null   int64         
 6   floors         4600 non-null   float64       
 7   waterfront     4600 non-null   int64         
 8   view           4600 non-null   int64         
 9   condition      4600 non-null   int64         
 10  sqft_above     4600 non-null   int64         
 11  sqft_basement  4600 non-null   int64         
 12  yr_built       4600 non-null   int64         
 13  yr_renovated   4600 non-null   int64         
 14  street         4600 non-null   object        
 15  city           4600 n

In [29]:
houseprz_df = pd.get_dummies(houseprz_df, columns=['city','street'])
x = houseprz_df.drop(['statezip', 'date', 'country'], axis=1)
y = houseprz_df['price']

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=111)

In [31]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [32]:
for name, model in models.items():
    print(f'using : {name}')
    mlmodel(model)

using : LR
Train Accuracy : 1.0
Test Accuracy : -20.776556236069823
using : KNN
Train Accuracy : 0.3308424118138439
Test Accuracy : -0.09308999163503962
using : SVR
Train Accuracy : -0.022131119623616247
Test Accuracy : -0.042479517596940575
using : DT
Train Accuracy : 1.0
Test Accuracy : 0.9997587564167838
using : RF
Train Accuracy : 0.9578381567864668
Test Accuracy : 0.999712002642865
using : XGBR
Train Accuracy : 0.9999755271764816
Test Accuracy : 0.9995805110799685


In [33]:
data_df = pd.read_csv('Assignment Datasets/data.csv')

In [34]:
data_df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [35]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     22894 non-null  int64  
 1   Product_ID                  22894 non-null  object 
 2   Gender                      22894 non-null  object 
 3   Age                         22894 non-null  object 
 4   Occupation                  22894 non-null  int64  
 5   City_Category               22894 non-null  object 
 6   Stay_In_Current_City_Years  22894 non-null  object 
 7   Marital_Status              22894 non-null  int64  
 8   Product_Category_1          22894 non-null  int64  
 9   Product_Category_2          12804 non-null  float64
 10  Product_Category_3          5576 non-null   float64
 11  Purchase                    22894 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 2.1+ MB


In [36]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     22894 non-null  int64  
 1   Product_ID                  22894 non-null  object 
 2   Gender                      22894 non-null  object 
 3   Age                         22894 non-null  object 
 4   Occupation                  22894 non-null  int64  
 5   City_Category               22894 non-null  object 
 6   Stay_In_Current_City_Years  22894 non-null  object 
 7   Marital_Status              22894 non-null  int64  
 8   Product_Category_1          22894 non-null  int64  
 9   Product_Category_2          12804 non-null  float64
 10  Product_Category_3          5576 non-null   float64
 11  Purchase                    22894 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 2.1+ MB


In [37]:
knn_imp = KNNImputer()

In [38]:
data_df['Product_Category_2'] = knn_imp.fit_transform(data_df[['Product_Category_2']])
data_df['Product_Category_3'] = knn_imp.fit_transform(data_df[['Product_Category_3']])
data_df['Purchase'] = knn_imp.fit_transform(data_df[['Purchase']])

In [39]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     22894 non-null  int64  
 1   Product_ID                  22894 non-null  object 
 2   Gender                      22894 non-null  object 
 3   Age                         22894 non-null  object 
 4   Occupation                  22894 non-null  int64  
 5   City_Category               22894 non-null  object 
 6   Stay_In_Current_City_Years  22894 non-null  object 
 7   Marital_Status              22894 non-null  int64  
 8   Product_Category_1          22894 non-null  int64  
 9   Product_Category_2          22894 non-null  float64
 10  Product_Category_3          22894 non-null  float64
 11  Purchase                    22894 non-null  float64
dtypes: float64(3), int64(4), object(5)
memory usage: 2.1+ MB


In [40]:
data_df['Stay_In_Current_City_Years'].value_counts()

1     8134
2     4291
3     4024
4+    3626
0     2819
Name: Stay_In_Current_City_Years, dtype: int64

In [41]:
data_df['Stay_In_Current_City_Years'] = data_df['Stay_In_Current_City_Years'].str.replace('+', '')
data_df['Stay_In_Current_City_Years'] = pd.to_numeric(data_df['Stay_In_Current_City_Years'])

  data_df['Stay_In_Current_City_Years'] = data_df['Stay_In_Current_City_Years'].str.replace('+', '')


In [42]:
data_df['Stay_In_Current_City_Years'].value_counts()

1    8134
2    4291
3    4024
4    3626
0    2819
Name: Stay_In_Current_City_Years, dtype: int64

In [43]:
data_df = pd.get_dummies(data_df, columns=['Gender', 'Age', 'City_Category'])

In [44]:
x = data_df.drop(['Purchase', 'User_ID', 'Product_ID'], axis=1)
y = data_df['Purchase']

In [45]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=111)

In [46]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [47]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [48]:
for name, model in models.items():
    print(f'using : {name}')
    mlmodel(model)

using : LR
Train Accuracy : 0.4227068618525721
Test Accuracy : 0.43881887689485977
using : KNN
Train Accuracy : 0.5978535416273507
Test Accuracy : 0.41872629682675655
using : SVR
Train Accuracy : 0.02800883137836141
Test Accuracy : 0.028779963418583354
using : DT
Train Accuracy : 0.9678959871992392
Test Accuracy : 0.6006845063754818
using : RF
Train Accuracy : 0.9405061154392004
Test Accuracy : 0.74526758038942
using : XGBR
Train Accuracy : 0.8561807450430836
Test Accuracy : 0.7615643022519599
