# Part I : Supervised Learning

## Portugal Bank Marketing Dataset

### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
from warnings import filterwarnings
filterwarnings('ignore')

### Creating a dataframe and a copy to work on

In [2]:
# Load the raw, semicolon-separated bank marketing data from the working directory.
Bank = pd.read_csv('bank.csv', sep=';')

In [3]:
# Work on a copy so the frame loaded from disk (`Bank`) stays untouched.
bank=Bank.copy()

### Shape Check

In [4]:
# Dimensions of the working copy as (rows, columns).
bank.shape

(41188, 21)

In [5]:
# Preview the first five rows.
bank.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Data Type Check

In [6]:
# Per-column dtypes; the `object` columns are the ones treated as categorical further down.
bank.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

### Missing Value Analysis

In [7]:
# Number of null entries in each column.
bank.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

No missing values

### Separating into Numerical and Categorical Datasets

In [8]:
# Split the columns by dtype: `object` columns are treated as categorical,
# everything else as numerical.  `.copy()` gives each subset its own data so
# later column assignments on these frames never go through pandas'
# chained-assignment path (its warning would be hidden by the global
# filterwarnings('ignore')).
bank_categorical = bank.select_dtypes(include='object').copy()
bank_categorical_cols = list(bank_categorical.columns)
bank_numerical = bank.select_dtypes(exclude='object').copy()
bank_numerical_cols = list(bank_numerical.columns)

### Standardization

In [9]:
# Scaler instance used to standardise the numerical columns in the next cell.
ss=StandardScaler()

In [10]:
# Standardise every numerical column and rebuild a labelled DataFrame in one
# step.  The original row index is carried over explicitly so the later
# column-wise concat with the categorical frame aligns on the same labels
# instead of relying on both frames happening to have a default RangeIndex.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so test-set statistics leak into the
# features. Fitting on the training partition only would avoid that.
bank_numerical = pd.DataFrame(
    ss.fit_transform(bank_numerical),
    columns=bank_numerical_cols,
    index=bank_numerical.index,
)

###  Label Encoding

In [11]:
# Single encoder instance; it is re-fitted on each categorical column in the next cell.
le = LabelEncoder()

In [12]:
# Integer-encode every categorical column, including the target `y`.
# The frame is re-derived from `bank` on each run so the cell is idempotent
# (re-running it never re-encodes already-encoded integers) and owns its data.
# `le` is re-fitted per column, so the learned classes of each column are
# stored in `label_classes`; position k in label_classes[col] is the original
# label that was mapped to integer k.
# NOTE(review): LabelEncoder imposes an arbitrary ordering on nominal features
# such as `job`; one-hot encoding may suit the linear/distance-based models better.
label_classes = {}
bank_categorical = bank[bank_categorical_cols].copy()
for col in bank_categorical_cols:
    bank_categorical[col] = le.fit_transform(bank_categorical[col])
    label_classes[col] = le.classes_

### Combining the numerical and categorical features

In [13]:
# Place the encoded categorical columns and the scaled numerical columns side by side.
# axis=1 aligns rows on the index, so both frames must share the same row labels.
bank_final=pd.concat([bank_categorical,bank_numerical],axis=1)

In [14]:
# Column order after the concat: categorical columns (including the target `y`) first, then numerical.
bank_final.columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome', 'y', 'age', 'duration', 'campaign',
       'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'euribor3m', 'nr.employed'],
      dtype='object')

### Train Test Split

In [35]:
# Features are every column except the encoded target `y`.
# To train only on the columns picked by the random-forest selector, use
# bank_final[['age', 'duration', 'euribor3m', 'nr.employed']] for `x` instead.
# NOTE(review): the dataset's documentation advises dropping `duration` for a
# realistic predictive model - confirm whether it should stay in `x`.
x = bank_final.drop(['y'], axis=1)
y = bank_final.y

# 70/30 split.  A fixed seed makes the partition identical on every run, so
# all models are scored on the same rows regardless of cell execution order;
# stratify=y keeps the class ratio of the target equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

### Selecting Features using Random Forests

In [36]:
# Rank the features with a random forest fitted on the training partition and
# keep those that pass SelectFromModel's default importance threshold.
# The forest is seeded so the selected set is the same on every run.
# NOTE(review): `selected_feat` is only printed; the models below are still
# trained on all columns of X_train.
sel = SelectFromModel(RandomForestClassifier(random_state=42))
sel.fit(X_train, y_train)
selected_feat = X_train.columns[sel.get_support()]
print(selected_feat)

Index(['age', 'duration', 'euribor3m', 'nr.employed'], dtype='object')


The features printed above are the ones that the random-forest-based selector marked as important.

### Defining a function that takes in a Model and gives out Accuracy

In [37]:
def getscore(model):
    """Fit `model` on the training split and print its train/test scores.

    The estimator is fitted in place on the notebook-level `X_train` /
    `y_train`, then evaluated with its own `score` method on both the
    training and the test partitions. Each score is multiplied by 100
    before printing.

    Parameters
    ----------
    model : object exposing `fit(X, y)` and `score(X, y)`.

    Returns
    -------
    None - the two scores are only printed.
    """
    model.fit(X_train, y_train)
    train_pct = model.score(X_train, y_train) * 100
    test_pct = model.score(X_test, y_test) * 100
    print('Train Data Score : ', train_pct)
    print('Test Data Score : ', test_pct)

### Logistic Regression

In [38]:
# Logistic regression with scikit-learn's default hyperparameters.
getscore(LogisticRegression())

Train Data Score :  91.02355103881239
Test Data Score :  91.06579266812334


### AdaBoost

In [33]:
# AdaBoost with default hyperparameters.
# NOTE(review): this cell's execution count (33) is lower than the split cell's (35),
# so the recorded output may come from an earlier split - re-run before comparing.
getscore(AdaBoostClassifier())

Train Data Score :  90.11133848981999
Test Data Score :  90.28081249494214


### Naive Bayes

In [20]:
# Gaussian naive Bayes with default hyperparameters.
# NOTE(review): execution count (20) predates the split cell (35); re-run before comparing.
getscore(GaussianNB())

Train Data Score :  84.66234261732163
Test Data Score :  84.58363680504976


### KNN

In [21]:
# k-nearest neighbours with default hyperparameters.
# NOTE(review): execution count (21) predates the split cell (35); re-run before comparing.
getscore(KNeighborsClassifier())

Train Data Score :  92.7855433387673
Test Data Score :  89.52820263818079


### SVC

In [22]:
# Support vector classifier with default hyperparameters.
# NOTE(review): execution count (22) predates the split cell (35); re-run before comparing.
getscore(SVC())

Train Data Score :  91.14147965731331
Test Data Score :  90.75018208302986


Except for Naive Bayes, all of the models perform well. Logistic Regression looks like the best choice: it has the highest test score and almost no gap between its train and test scores.

# Part II : Time Series

## Indian Onion Market

In [63]:
# Load the month-wise onion market arrivals with the parsed `date` column as the index,
# then keep the loaded frame (`Onion`) untouched by working on a copy.
# NOTE(review): `year` and `month` are also listed in parse_dates. Per the dtypes
# output below, `year` ends up as datetime64 while `month` stays object -
# parsing only `date` may be what was intended; confirm before changing.
Onion=pd.read_csv('MonthWiseMarketArrivals_Clean.csv',parse_dates=['date','year','month'],index_col=['date'])
onion=Onion.copy()

### Shape Check

In [64]:
# Dimensions of the working copy as (rows, columns).
onion.shape

(10227, 9)

### Dtype Check

In [65]:
# Per-column dtypes after parsing (the `date` column is the index, so it is not listed).
onion.dtypes

market              object
month               object
year        datetime64[ns]
quantity             int64
priceMin             int64
priceMax             int64
priceMod             int64
state               object
city                object
dtype: object

### Missing Value Check

In [66]:
# Number of null entries in each column.
onion.isnull().sum()

market      0
month       0
year        0
quantity    0
priceMin    0
priceMax    0
priceMod    0
state       0
city        0
dtype: int64

In [67]:
# Preview the first five rows (still in file order at this point).
onion.head()

Unnamed: 0_level_0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2005-01-01,ABOHAR(PB),January,2005-01-01,2350,404,493,446,PB,ABOHAR
2006-01-01,ABOHAR(PB),January,2006-01-01,900,487,638,563,PB,ABOHAR
2010-01-01,ABOHAR(PB),January,2010-01-01,790,1283,1592,1460,PB,ABOHAR
2011-01-01,ABOHAR(PB),January,2011-01-01,245,3067,3750,3433,PB,ABOHAR
2012-01-01,ABOHAR(PB),January,2012-01-01,1035,523,686,605,PB,ABOHAR


In [83]:
# Order the rows by the `date` index (ascending) so each market's records are chronological.
onion=onion.sort_index()

### Modal price of onion for each month in the Mumbai market

In [84]:
# Keep only the rows whose `market` is exactly 'MUMBAI'.
mop = onion[onion['market'].eq('MUMBAI')]

In [86]:
# Preview the first five Mumbai rows.
mop.head()

Unnamed: 0_level_0,market,month,year,quantity,priceMin,priceMax,priceMod,state,city
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01,MUMBAI,January,2004-01-01,267100,719,971,849,MS,MUMBAI
2004-02-01,MUMBAI,February,2004-01-01,239350,614,834,736,MS,MUMBAI
2004-03-01,MUMBAI,March,2004-01-01,238000,436,557,498,MS,MUMBAI
2004-04-01,MUMBAI,April,2004-01-01,198200,346,475,397,MS,MUMBAI
2004-05-01,MUMBAI,May,2004-01-01,166430,357,466,405,MS,MUMBAI


In [91]:
# One-column frame holding Mumbai's modal price, indexed by date.
modprice = mop['priceMod'].to_frame()

In [92]:
# Preview the first five modal-price rows.
modprice.head()

Unnamed: 0_level_0,priceMod
date,Unnamed: 1_level_1
2004-01-01,849
2004-02-01,736
2004-03-01,498
2004-04-01,397
2004-05-01,405


The variable `modprice` now holds the monthly modal price of onions in the Mumbai market, indexed by date.