In [None]:
import os

import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Project root directory. Set the PREDICT_DROUGHTS_DIR environment variable to run the
# notebook on another machine; the fallback is the original hard-coded location.
# os.path.join(..., '') guarantees a trailing separator, which the notebook relies on
# when it builds paths as `base_url + 'files/...'`.
base_url = os.path.join(
    os.environ.get('PREDICT_DROUGHTS_DIR', r'/home/amit/DataScienceProject/PredictDroughts/'),
    '',
)

Preparation for Machine Learning

In [None]:
# Load the merged dataset; the first CSV column is used as the row index.
merged_csv_path = base_url + 'files/merge.csv'
df = pd.read_csv(merged_csv_path, index_col=0)
df

In [None]:
# Separate the 'Week' date into numeric Year, Month and Day columns (inserted as the
# first three columns), then drop the original 'Week' column.

week_dates = pd.to_datetime(df['Week'])
df['Week'] = week_dates

set_year = week_dates.dt.year.to_list()
set_month = week_dates.dt.month.to_list()
set_day = week_dates.dt.day.to_list()

date_parts = [("Year", set_year), ("Month", set_month), ("Day", set_day)]
for position, (label, values) in enumerate(date_parts):
    df.insert(position, label, values, None)

df = df.drop(columns='Week')
df

In [None]:
# One-hot encode the 'LEVEL' column into level_* indicator columns and append them to df.
# NOTE(review): 'LEVEL' is also the prediction target (y), so these indicator columns
# carry the label itself and must not be used as model features.

level_values = df['LEVEL']
encoding_columns = pd.get_dummies(level_values, prefix="level")
df = df.join(encoding_columns, how="left")
df

In [None]:
# Build the feature matrix X: every column of df except the target, the columns the
# notebook excludes as object (string) typed, and the one-hot copies of the target.

non_feature_columns = ['LEVEL', 'State', 'Postal Code', 'Aland_SQMI']

# The level_* columns were produced by one-hot encoding the target 'LEVEL'. Leaving
# them in X leaks the label into the features, so any model can read the answer
# directly and the reported accuracies become meaningless.
target_dummy_columns = list(encoding_columns.columns)

feature_mask = ~df.columns.isin(non_feature_columns + target_dummy_columns)
X = df.loc[:, feature_mask]
X

In [None]:
# Target: the 'LEVEL' column, kept as a single-column DataFrame.
target_columns = ['LEVEL']
y = df.loc[:, target_columns]
y

In [None]:
# 70% train, 30% test. The fixed random_state makes the split, and therefore every
# score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Accumulator for one score (in percent) per model, appended in the same order as `models`.
score_list = list()

# Display names of the compared models, in the order they are evaluated.
models = [
    "KNeighborsClassifier",
    "LogisticRegression",
    "DecisionTreeClassifier",
    "SVC",
]

KNN Model

In [None]:
# Check KNN accuracy on the train and test sets for k = 1..20.

# Flatten the single-column label frames once so fitting and scoring both receive 1-D arrays.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

knn_records = []
for k in range(1, 21):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train_flat)
    knn_records.append({
        "k": k,
        "train_accuracy": metrics.accuracy_score(y_true=y_train_flat, y_pred=clf.predict(X_train)),
        "test_accuracy": metrics.accuracy_score(y_true=y_test_flat, y_pred=clf.predict(X_test)),
    })

# Stored under its own name: rebinding `df` here would replace the loaded dataset and
# break any earlier cell that is re-run afterwards.
knn_accuracy_df = pd.DataFrame(knn_records)
knn_accuracy_df

In [None]:
# Cross validation: grid-search the odd neighbour counts 1..23 and record the best
# mean cross-validated accuracy (as a percentage) for the KNN model.

accuracy_scorer = make_scorer(metrics.accuracy_score, greater_is_better=True)
parameters = {'n_neighbors': range(1, 25, 2)}
knn = KNeighborsClassifier()
clf = GridSearchCV(estimator=knn, param_grid=parameters, scoring=accuracy_scorer)
clf.fit(X_train, y_train.values.ravel())

best_knn_cv_score = clf.best_score_
print("best parameter set is:", clf.best_params_, " and its score was", best_knn_cv_score)
score_list.append(best_knn_cv_score * 100)

Logistic Regression Model

In [None]:
# Fit a logistic regression with default settings and record its test-set accuracy
# as a percentage.
lg = LogisticRegression()
lg.fit(X_train, y_train.values.ravel())

score = lg.score(X_test, y_test) * 100
print(score)
score_list.append(score)

Decision Tree Classifier Model

In [None]:
# Fit a single decision tree and compare its accuracy on the train and test sets.

# DecisionTreeClassifier has no `bootstrap` or `n_estimators` parameters (those belong
# to random forests) and raises TypeError when given them, so only the seed is passed.
# The variable keeps the name `forest` because the grid-search cell below refers to it.
forest = DecisionTreeClassifier(random_state=0)
trained_forest = forest.fit(X_train, y_train.values.ravel())

y_pred_train = trained_forest.predict(X_train)
y_pred = trained_forest.predict(X_test)
print('Accuracy on training data = ', metrics.accuracy_score(y_true=y_train, y_pred=y_pred_train))
print('Accuracy on test data = ', metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Cross-validate the tree with a single fixed configuration (max_depth=2,
# min_samples_split=20) and record its cross-validated accuracy as a percentage.
tree_accuracy_scorer = make_scorer(metrics.accuracy_score, greater_is_better=True)
parameters = {
    'max_depth': [2],
    "min_samples_split": [20],
}
clf = GridSearchCV(estimator=forest, param_grid=parameters, scoring=tree_accuracy_scorer)
clf.fit(X_train, y_train.values.ravel())

best_tree_cv_score = clf.best_score_
print("best parameter set is: ", clf.best_params_, " and its score was ", best_tree_cv_score)
score_list.append(best_tree_cv_score * 100)

In [None]:
# SVM model: fit an SVC with default settings and record its test-set accuracy
# as a percentage.

# Flatten the single-column label frames: fitting on a column vector triggers a
# DataConversionWarning, and every other model in this notebook is fitted on 1-D labels.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

SVM = SVC().fit(X_train, y_train_flat)
SVM_score = SVM.score(X_test, y_test_flat) * 100
score_list.append(SVM_score)

y_pred_train = SVM.predict(X_train)
y_pred = SVM.predict(X_test)
print('Accuracy on training data = ', metrics.accuracy_score(y_true=y_train_flat, y_pred=y_pred_train))
print('Accuracy on test data = ', metrics.accuracy_score(y_true=y_test_flat, y_pred=y_pred))
print('Accuracy score = ', SVM_score)

In [None]:
# Pair each model name with its collected score; both sequences must have equal length.
data = dict(Model=models, Score=score_list)

In [None]:
# Summary table: one row per model with its score in percent.
score_df = pd.DataFrame.from_dict(data)
score_df

In [None]:
# Horizontal bar chart of the collected model scores.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
g = sns.barplot(x="Score", y="Model", data=score_df, palette="Set3", orient="h")
g = g.set_title("Cross validation scores")

We can see that the Decision Tree classifier reaches 91.7% accuracy, higher than every other model we tried,
so for us the Decision Tree classifier is the best model!