# Assignment - Machine Learning - Supervised - Regression

In [2]:
# Used libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRFRegressor

In [42]:
# models used to try their performances
models={
    'LR':LinearRegression(),
    'KNN':KNeighborsRegressor(),
    'DT':DecisionTreeRegressor(),
    'RF':RandomForestRegressor(),
    'SVR':SVR(),
    'XGBR':XGBRFRegressor()
}

## 1) 

In [8]:
# Reading the DataFrame
df_strp=pd.read_csv('Assignment Datasets/50_Startups.csv')
df_strp.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
7,130298.13,145530.06,323876.68,Florida,155752.6
12,93863.75,127320.38,249839.44,Florida,141585.52
22,73994.56,122782.75,303319.26,Florida,110352.25
35,46014.02,85047.44,205517.64,New York,96479.51
42,23640.93,96189.63,148001.11,California,71498.49


In [4]:
df_strp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [5]:
# Unique values in 'State' column
df_strp.State.value_counts()

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [9]:
# Convert 'State' feature into dummy variables (numbers) for ML models
df_strp=pd.get_dummies(df_strp, columns=['State'], drop_first=True)
df_strp.sample()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
12,93863.75,127320.38,249839.44,141585.52,1,0


In [10]:
# Split independent data, and the dependent data to be predicted i.e. 'Profit'
df_strp.Profit=df_strp.Profit.apply(lambda x: round(x))

X=df_strp.drop(columns=['Profit'])
y=df_strp.Profit
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

In [11]:
# Scaling down data before entering the model
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [12]:
# Apply each model on our data, in addition, test each for accuracy on both train and test parts
for name, model in models.items():
    print(f'Using: {name}')
    model.fit(X_train, y_train)
    print(f'Train Accuracy: {model.score(X_train, y_train)}')
    print(f'Test Accuracy: {model.score(X_test, y_test)}')
    print('-'*30,'\n')

Using: LR
Train Accuracy: 0.9437660117092603
Test Accuracy: 0.9589696620285054
------------------------------ 

Using: KNN
Train Accuracy: 0.7237998350161442
Test Accuracy: 0.613132378244589
------------------------------ 

Using: DT
Train Accuracy: 1.0
Test Accuracy: 0.8829799895644633
------------------------------ 

Using: RF
Train Accuracy: 0.9892350642590136
Test Accuracy: 0.922476716735192
------------------------------ 

Using: SVR
Train Accuracy: -0.003901199939421174
Test Accuracy: -0.34343971425055275
------------------------------ 

Using: XGBR
Train Accuracy: 0.9963359312269129
Test Accuracy: 0.9083137709398517
------------------------------ 



Based on above results, **Linear Regression** algorithm showed best testing accuracy, also being accurate in the train data without overfitting.
<br /> <hr>
## 2)

In [22]:
# Reading the DataFrame
df_hs=pd.read_csv('Assignment Datasets/House prices.csv')
df_hs.sample(5)

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
2588,2014-06-16 00:00:00,565000.0,4.0,2.75,3130,139392,2.0,0,0,4,3130,0,1981,0,27705 114th Ave SE,Kent,WA 98030,USA
242,2014-05-07 00:00:00,758000.0,4.0,2.75,2410,9549,1.0,0,0,4,1780,630,1956,0,10627 NE 45th St,Kirkland,WA 98033,USA
171,2014-05-06 00:00:00,736000.0,4.0,2.5,2290,12047,2.0,0,0,4,2290,0,1988,0,17435 SE 47th St,Bellevue,WA 98006,USA
3469,2014-06-26 00:00:00,1340000.0,4.0,3.5,3190,5040,2.0,0,3,3,2160,1030,2003,0,1120 33rd Ave S,Seattle,WA 98144,USA
577,2014-05-13 00:00:00,464000.0,5.0,2.5,3400,8970,1.0,0,0,4,1700,1700,1959,0,14700 Burke Ave N,Shoreline,WA 98133,USA


In [14]:
df_hs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [24]:
# Extract months from the 'date' feature after converting its data type to datetime
df_hs.date=pd.to_datetime(df_hs.date, format='%Y-%m-%d %H:%M:%S')
df_hs['month']=df_hs.date.dt.month
df_hs.drop('date', 1, inplace=True)
df_hs.sample()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country,month
267,420000.0,3.0,2.25,1770,8165,2.0,0,0,3,1770,0,1977,2004,12921 79th Pl NE,Kirkland,WA 98034,USA,5


In [27]:
# Convert 'city' feature into dummy variables (numbers) for ML models
df_hs=pd.get_dummies(df_hs, columns=['city'], drop_first=True)
df_hs.sample()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,...,city_SeaTac,city_Seattle,city_Shoreline,city_Skykomish,city_Snoqualmie,city_Snoqualmie Pass,city_Tukwila,city_Vashon,city_Woodinville,city_Yarrow Point
2783,240000.0,2.0,1.0,670,10920,1.0,0,0,3,670,...,0,1,0,0,0,0,0,0,0,0


In [28]:
# Split independent data, and the dependent data to be predicted i.e. 'price'
df_hs.price=df_hs.price.apply(lambda x: round(x))

# columns to be ignored in our training set. All countries are USA, so feature shall be ignored
X=df_hs.drop(columns=['price', 'street', 'statezip', 'country'])
y=df_hs.price
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

In [29]:
# Scaling down data before entering the model
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [30]:
# Apply each model on our data, in addition, test each for accuracy on both train and test parts
for name, model in models.items():
    print(f'Using: {name}')
    model.fit(X_train, y_train)
    print(f'Train Accuracy: {model.score(X_train, y_train)}')
    print(f'Test Accuracy: {model.score(X_test, y_test)}')
    print('-'*30,'\n')

Using: LR
Train Accuracy: 0.23238842346114597
Test Accuracy: 0.6595482380842255
------------------------------ 

Using: KNN
Train Accuracy: 0.3867869078834223
Test Accuracy: 0.5255254658677782
------------------------------ 

Using: DT
Train Accuracy: 0.999998835228012
Test Accuracy: -5.721019081399469
------------------------------ 

Using: RF
Train Accuracy: 0.8801404337126194
Test Accuracy: 0.060589602390690445
------------------------------ 

Using: SVR
Train Accuracy: -0.02328941965769271
Test Accuracy: -0.057451042543622544
------------------------------ 

Using: XGBR
Train Accuracy: 0.8489726230095569
Test Accuracy: -0.7563987324891364
------------------------------ 



Based on above results, **Linear Regression** algorithm showed best testing accuracy.
<br /> <hr>
## 3)

In [31]:
df_dt=pd.read_csv('Assignment Datasets/data.csv')
df_dt.sample(5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
387,1000770,P00139942,M,18-25,4,B,2,1,5,,,8653
10876,1002271,P00226242,M,51-55,14,A,1,0,11,15.0,,4451
13068,1002743,P00123142,M,26-35,14,B,2,0,11,,,4487
4912,1001422,P00185342,M,36-45,17,B,3,0,3,4.0,,8065
10143,1002161,P00117542,F,36-45,2,C,2,1,18,,,3137


In [32]:
df_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     22894 non-null  int64  
 1   Product_ID                  22894 non-null  object 
 2   Gender                      22894 non-null  object 
 3   Age                         22894 non-null  object 
 4   Occupation                  22894 non-null  int64  
 5   City_Category               22894 non-null  object 
 6   Stay_In_Current_City_Years  22894 non-null  object 
 7   Marital_Status              22894 non-null  int64  
 8   Product_Category_1          22894 non-null  int64  
 9   Product_Category_2          12804 non-null  float64
 10  Product_Category_3          5576 non-null   float64
 11  Purchase                    22894 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 2.1+ MB


In [34]:
# Show values present in the object 'Age'
df_dt.Age.unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [36]:
# Show values present in the object 'Stay_In_Current_City_Years'
df_dt.Stay_In_Current_City_Years.unique()

array(['2', '4+', '3', '1', '0'], dtype=object)

Cleaning data and Transforming features

In [38]:
# Convert 'Age' into numerical categorical ordinal data
df_dt.Age=df_dt.Age.map({'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6})

# Convert 'Stay_In_Current_City_Years' into numerical categorical ordinal data
df_dt.Stay_In_Current_City_Years.replace('4+', 4, True)
df_dt.Stay_In_Current_City_Years=pd.to_numeric(df_dt.Stay_In_Current_City_Years)

# It seems that products categories' columns represnt predictions of them, so we consider the first one only
df_dt.drop(columns=['Product_Category_2', 'Product_Category_3'], inplace=True)

# Convert categorical features into dummy variables (numbers) for ML models
df_dt=pd.get_dummies(df_dt, columns=['Gender', 'City_Category'], drop_first=True)
df_dt.sample()

Unnamed: 0,User_ID,Product_ID,Age,Occupation,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Purchase,Gender_M,City_Category_B,City_Category_C
10726,1002239,P00174242,1,10,0,0,15,21034,1,0,0


In [39]:
df_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22894 entries, 0 to 22893
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   User_ID                     22894 non-null  int64 
 1   Product_ID                  22894 non-null  object
 2   Age                         22894 non-null  int64 
 3   Occupation                  22894 non-null  int64 
 4   Stay_In_Current_City_Years  22894 non-null  int64 
 5   Marital_Status              22894 non-null  int64 
 6   Product_Category_1          22894 non-null  int64 
 7   Purchase                    22894 non-null  int64 
 8   Gender_M                    22894 non-null  uint8 
 9   City_Category_B             22894 non-null  uint8 
 10  City_Category_C             22894 non-null  uint8 
dtypes: int64(7), object(1), uint8(3)
memory usage: 1.5+ MB


In [40]:
# Split independent data, and the dependent data to be predicted i.e. 'price'
# columns to be ignored in our training set.
X=df_dt.drop(columns=['User_ID', 'Product_ID'])
y=df_dt.Purchase
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)

In [41]:
# Scaling down data before entering the model
scaler=StandardScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

In [43]:
# Apply each model on our data, in addition, test each for accuracy on both train and test parts
for name, model in models.items():
    print(f'Using: {name}')
    model.fit(X_train, y_train)
    print(f'Train Accuracy: {model.score(X_train, y_train)}')
    print(f'Test Accuracy: {model.score(X_test, y_test)}')
    print('-'*30,'\n')

Using: LR
Train Accuracy: 1.0
Test Accuracy: 1.0
------------------------------ 

Using: KNN
Train Accuracy: 0.981204208117068
Test Accuracy: 0.9709175544666057
------------------------------ 

Using: DT
Train Accuracy: 1.0
Test Accuracy: 0.9999991826571308
------------------------------ 

Using: RF
Train Accuracy: 0.9999999080195007
Test Accuracy: 0.9999997137144145
------------------------------ 

Using: SVR
Train Accuracy: 0.19010894776419163
Test Accuracy: 0.18483291692839854
------------------------------ 

Using: XGBR
Train Accuracy: 0.9996988397964821
Test Accuracy: 0.9996918054205792
------------------------------ 



From above findings, we can conclude that **Random Forest Regressor** is the best testing accuracy