# Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as ltb
from sklearn.metrics import mean_squared_error,r2_score
import math
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training set; "train.csv" is resolved relative to the current working directory.
data = pd.read_csv("train.csv")

In [3]:
# Preview the loaded frame (shown as the cell's last expression).
data

Unnamed: 0,Id,Open Date,City,City Group,Type,P1,P2,P3,P4,P5,...,P29,P30,P31,P32,P33,P34,P35,P36,P37,revenue
0,0,07/17/1999,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,...,3.0,5,3,4,5,5,4,3,4,5653753.0
1,1,02/14/2008,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,...,3.0,0,0,0,0,0,0,0,0,6923131.0
2,2,03/09/2013,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,2055379.0
3,3,02/02/2012,Tokat,Other,IL,6,4.5,6.0,6.0,4,...,7.5,25,12,10,6,18,12,12,6,2675511.0
4,4,05/09/2009,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,...,3.0,5,1,3,2,3,4,3,3,4316715.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,132,06/25/2008,Trabzon,Other,FC,2,3.0,3.0,5.0,4,...,3.0,0,0,0,0,0,0,0,0,5787594.0
133,133,10/12/2006,İzmir,Big Cities,FC,4,5.0,4.0,4.0,2,...,3.0,0,0,0,0,0,0,0,0,9262754.0
134,134,07/08/2006,Kayseri,Other,FC,3,4.0,4.0,4.0,2,...,3.0,0,0,0,0,0,0,0,0,2544857.0
135,135,10/29/2010,İstanbul,Big Cities,FC,4,5.0,4.0,5.0,2,...,3.0,0,0,0,0,0,0,0,0,7217634.0


In [None]:
# Count the missing values in every column.
data.isnull().sum()

In [None]:
# Number of rows in each "City Group" category.
data["City Group"].value_counts()

In [None]:
# Number of rows for each distinct "City" value.
data["City"].value_counts()

In [None]:
# Number of rows in each "Type" category.
data["Type"].value_counts()

In [None]:
# Remove the "Id" column (presumably a plain row identifier -- confirm against the data).
# errors="ignore" keeps the cell safe to re-run: without it a second execution
# raises KeyError because the column has already been removed in place.
data.drop(columns=["Id"], inplace=True, errors="ignore")

# Data Visualization:

In [None]:
def plot(data,feature,a,b):
    """Draw a bar chart showing how often each distinct value of a column occurs.

    Parameters
    ----------
    data : object supporting ``data[feature].value_counts()`` (e.g. a DataFrame).
    feature : name of the column whose values are counted; also used in the
        title and as the x-axis label.
    a, b : width and height handed to ``plt.figure(figsize=(a, b))``.

    Returns
    -------
    None. The chart is drawn on a newly created pyplot figure.
    """
    counts = data[feature].value_counts()
    categories = list(counts.index)
    frequencies = list(counts.to_numpy())

    plt.figure(figsize=(a, b))
    plt.title(f"Bar plot for {feature}")
    plt.bar(categories, frequencies)
    plt.xlabel(feature, fontsize=18)
    plt.ylabel('Count', fontsize=18)
    

In [None]:
# Bar chart of "City Group" frequencies, drawn with figsize (10, 6).
plot(data,'City Group',10,6)

In [None]:
# Bar chart of "City" frequencies on a wider (30, 8) figure -- presumably to fit the larger number of city labels.
plot(data,'City',30,8)

In [None]:
# Bar chart of "Type" frequencies, drawn with figsize (10, 6).
plot(data,'Type',10,6)

# Working with datetime features:

In [None]:
# Parse the opening date, order the rows chronologically and derive calendar features.
data["Open Date"] = pd.to_datetime(data["Open Date"])
data.sort_values(by="Open Date", inplace=True)  # ascending: oldest opening first

data["Year"] = data["Open Date"].dt.year
data["Month"] = data["Open Date"].dt.month
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in pandas 2.0.
# isocalendar().week is its replacement (same ISO week number, needs pandas >= 1.1);
# it returns a nullable UInt32 column, so cast to int64 to keep the dtype the
# old attribute produced for downstream arithmetic and model fitting.
data["Week"] = data["Open Date"].dt.isocalendar().week.astype("int64")

# Fresh 0..n-1 index in the sorted order; the pre-sort index is discarded.
df = data.reset_index(drop=True)

In [None]:
# Inspect the date-sorted frame, which now carries the Year / Month / Week columns.
df

In [None]:
# Revenue plotted against opening date.
# The former `palette="Dark2"` argument was removed: seaborn applies a palette only
# when a `hue` variable is mapped, so without one it had no effect on the figure.
fig, ax = plt.subplots(figsize=(14, 5))
sns.lineplot(x="Open Date", y="revenue", data=df, ax=ax)
ax.set_title("Revenue by opening date");

# Data Preprocessing:

In [None]:
# Target is the revenue column; the features are every other column except the
# raw "Open Date" timestamp.
y = df["revenue"]
x = df.drop(columns=["Open Date", "revenue"])

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
# (rows, columns) of the training and held-out feature frames.
x_train.shape,x_test.shape

In [None]:
# Lengths of the matching target vectors.
y_train.shape,y_test.shape

# Cyclical Encoding:

In [None]:
# Project month (1-12) and week of year (1-53) onto the unit circle so the last
# value of each cycle ends up next to the first (e.g. December beside January).
train_month_angle = (x_train["Month"] - 1) * (2. * np.pi / 12)
train_week_angle = (x_train["Week"] - 1) * (2. * np.pi / 53)

x_train["month_sin"] = np.sin(train_month_angle)
x_train["month_cos"] = np.cos(train_month_angle)
# Despite the "weekday" prefix, these two columns encode the week of the year.
x_train["weekday_sin"] = np.sin(train_week_angle)
x_train["weekday_cos"] = np.cos(train_week_angle)

In [None]:
# Apply the same sin/cos month and week-of-year encoding to the held-out rows,
# so that the test frame gets the same four columns as the training frame.
test_month_angle = (x_test["Month"] - 1) * (2. * np.pi / 12)
test_week_angle = (x_test["Week"] - 1) * (2. * np.pi / 53)

x_test["month_sin"] = np.sin(test_month_angle)
x_test["month_cos"] = np.cos(test_month_angle)
# Despite the "weekday" prefix, these two columns encode the week of the year.
x_test["weekday_sin"] = np.sin(test_week_angle)
x_test["weekday_cos"] = np.cos(test_week_angle)

# CatBoost Encoding:

In [None]:
# Categorical columns that will be replaced by target-based numeric encodings.
label = ["City","City Group","Type"]
# CatBoost-style target encoder from category_encoders, created with its default settings.
cbe_encoder = ce.cat_boost.CatBoostEncoder()

In [None]:
# Fit the encoder on the training split only, using the training targets.
x_train[label] = cbe_encoder.fit_transform(x_train[label], y_train)
# Encode the held-out rows WITHOUT their targets. Supplying y_test switches the
# encoder into its training mode, so the test encodings were computed from test
# revenue instead of the statistics learned on the training split. That leaks the
# target into the features and makes the reported test error unreliable.
x_test[label] = cbe_encoder.transform(x_test[label])

In [None]:
# Spot-check the first rows of the held-out features after encoding.
x_test.head(5)

# Final Dataframe:

In [None]:
# Month and Week are now represented by their sin/cos columns; drop the raw
# calendar columns (Year included) from both splits before modelling.
raw_calendar_cols = ["Year", "Month", "Week"]
training_data = x_train.drop(columns=raw_calendar_cols)
testing_data = x_test.drop(columns=raw_calendar_cols)

# Modeling:

# Linear Regression:

In [None]:
# Baseline model: linear regression with default settings, fitted on the encoded training features.
lr = LinearRegression()
lr.fit(training_data,y_train)

In [None]:
# Linear-regression revenue predictions for the held-out rows.
pred_lr = lr.predict(testing_data)

In [None]:
# Root-mean-squared error of the linear model on the held-out split.
mse_test = mean_squared_error(y_test, pred_lr)
rmse_lr = math.sqrt(mse_test)

print("RMSE test :", rmse_lr)

# Random Forest Regressor:

In [None]:
# Random forest with default hyperparameters.
# random_state is pinned because the forest's bootstrapping is random: without a
# seed the fitted model, its RMSE and the model comparison at the end of the
# notebook change on every run.
rf = RandomForestRegressor(random_state=0)
rf.fit(training_data, y_train)

In [None]:
# Random-forest revenue predictions for the held-out rows.
pred_rf = rf.predict(testing_data)

In [None]:
# Root-mean-squared error of the random forest on the held-out split.
mse_test = mean_squared_error(y_test, pred_rf)
rmse_rf = math.sqrt(mse_test)

print("RMSE test :", rmse_rf)

# LGB Regressor:

In [None]:
# LightGBM regressor with default hyperparameters, fitted on the same training features.
lgb = ltb.LGBMRegressor()
lgb.fit(training_data, y_train)

In [None]:
# LightGBM revenue predictions for the held-out rows.
pred_lgb = lgb.predict(testing_data)

In [None]:
# Root-mean-squared error of the LightGBM model on the held-out split.
mse_test = mean_squared_error(y_test, pred_lgb)
rmse_lgb = math.sqrt(mse_test)

print("RMSE test :", rmse_lgb)

# Light Gradient Boosting gives the lowest RMSE value