# Execute the code below

In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

# Remote CSV files: the 2018 "main" weather table and the 2018 "opinion" table.
link_main = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather_main_2018.csv"
link_opinion = "https://raw.githubusercontent.com/murpi/wilddata/master/quests/weather_opinion_2018.csv"

# Read both tables (main first, then opinion) and print the first rows of each.
df_main, df_opinion = (pd.read_csv(csv_url) for csv_url in (link_main, link_opinion))
for preview_frame in (df_main, df_opinion):
  print(preview_frame.head())

         DATE  MAX_TEMPERATURE_C  ...  DEWPOINT_MAX_C  WINDTEMP_MAX_C
0  2018-01-01                 12  ...               8               7
1  2018-01-02                 13  ...              12               6
2  2018-01-03                 15  ...              13               7
3  2018-01-04                 14  ...              12              10
4  2018-01-05                 12  ...              10               7

[5 rows x 15 columns]
         date  WEATHER_CODE_EVENING  TOTAL_SNOW_MM  UV_INDEX  SUNHOUR OPINION
0  2018-01-01                   113              0         3      5.1     bad
1  2018-03-12                   119              0         2      8.8     bad
2  2018-03-09                   116              0         3     10.2     bad
3  2018-10-07                   122              0         1      5.6     bad
4  2018-06-18                   119              0         1     12.9     bad


# Classification challenge

Your goals are:
- to merge both 2018 DataFrames
- to train-test split the new 2018 DataFrame
- to train 3 different Machine Learning algorithms (KNN, logistic regression and decision tree) with "opinion" as the target
- to try different parameters
- to find the best accuracy score (on the test set, of course)
- to fill the missing values in the "opinion" column with your best model
- to explain the "rules" your model uses to predict the opinion.

You can help yourself with charts if you want.

In [None]:
def merge_and_fill(DataFrameMain, DataFrameOpinion):
  """Left-join the opinion table onto the main table and fill feature gaps.

  Rows of ``DataFrameMain`` are matched to ``DataFrameOpinion`` on
  ``DATE`` == ``date``. Days without a match keep ``OPINION`` missing and are
  marked with ``flag`` = True; their other opinion-table columns are filled so
  they can still be used as model features.

  Parameters
  ----------
  DataFrameMain : pd.DataFrame
      Must contain a ``DATE`` column.
  DataFrameOpinion : pd.DataFrame
      Must contain ``date``, ``WEATHER_CODE_EVENING``, ``TOTAL_SNOW_MM``,
      ``UV_INDEX``, ``SUNHOUR`` and ``OPINION`` columns.

  Returns
  -------
  pd.DataFrame
      A new frame (the inputs are not modified) with ``date`` dropped,
      ``DATE`` converted to datetime, and a boolean ``flag`` column that is
      True where ``OPINION`` was missing after the merge.
  """
  NewDataFrame = pd.merge(DataFrameMain, DataFrameOpinion, how='left', left_on='DATE', right_on='date')

  # Record which days have no opinion before any column is filled.
  NewDataFrame['flag'] = NewDataFrame['OPINION'].isna()

  # Fill with the rounded column mean. The result is assigned back instead of
  # calling fillna(inplace=True) on a column selection: that chained form acts
  # on a temporary object and leaves the frame unchanged under Copy-on-Write.
  for column in ('WEATHER_CODE_EVENING', 'TOTAL_SNOW_MM', 'UV_INDEX'):
    column_mean = NewDataFrame[column].mean()
    # An all-missing column has a NaN mean, which cannot be rounded; leave it as is.
    if pd.notna(column_mean):
      NewDataFrame[column] = NewDataFrame[column].fillna(round(column_mean))

  # Back-fill from the next known day; the forward-fill only matters when the
  # last rows are missing, which a back-fill alone would leave as NaN.
  NewDataFrame['SUNHOUR'] = NewDataFrame['SUNHOUR'].bfill().ffill()

  # 'date' duplicates 'DATE' after the join.
  NewDataFrame = NewDataFrame.drop(columns='date')
  NewDataFrame['DATE'] = pd.to_datetime(NewDataFrame['DATE'])

  return NewDataFrame

In [None]:
# Build the merged, gap-filled 2018 table and display it.
df_total = merge_and_fill(df_main, df_opinion)
df_total

Unnamed: 0,DATE,MAX_TEMPERATURE_C,MIN_TEMPERATURE_C,WINDSPEED_MAX_KMH,TEMPERATURE_MORNING_C,TEMPERATURE_NOON_C,TEMPERATURE_EVENING_C,PRECIP_TOTAL_DAY_MM,HUMIDITY_MAX_PERCENT,VISIBILITY_AVG_KM,PRESSURE_MAX_MB,CLOUDCOVER_AVG_PERCENT,HEATINDEX_MAX_C,DEWPOINT_MAX_C,WINDTEMP_MAX_C,WEATHER_CODE_EVENING,TOTAL_SNOW_MM,UV_INDEX,SUNHOUR,OPINION,flag
0,2018-01-01,12,8,61,9,11,8,8.9,79,9.500,1018,41.750,12,8,7,113.0,0.0,3.0,5.1,bad,False
1,2018-01-02,13,6,26,8,12,13,0.6,96,9.000,1020,87.875,13,12,6,122.0,0.0,3.0,3.3,bad,False
2,2018-01-03,15,10,40,11,12,10,5.5,82,8.500,1017,91.500,15,13,7,122.0,0.0,3.0,3.3,bad,False
3,2018-01-04,14,11,45,14,14,11,0.0,89,10.000,1011,90.125,14,12,10,116.0,0.0,3.0,3.3,bad,False
4,2018-01-05,12,7,21,10,11,8,1.5,85,9.875,1005,62.375,12,10,7,116.0,0.0,3.0,6.9,bad,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,2018-12-27,7,2,6,2,6,5,0.0,85,10.000,1027,30.750,8,6,3,119.0,0.0,1.0,8.7,very bad,False
361,2018-12-28,7,2,8,2,7,3,0.0,89,8.000,1035,18.750,8,4,4,113.0,0.0,1.0,8.7,very bad,False
362,2018-12-29,7,1,6,1,6,4,0.0,94,7.000,1038,33.000,8,5,1,116.0,0.0,1.0,8.7,very bad,False
363,2018-12-30,9,4,6,5,9,8,0.1,95,6.000,1038,70.375,10,9,7,143.0,0.0,1.0,3.3,very bad,False


In [None]:
# Days flagged as having no opinion: these are the rows whose OPINION gets predicted.
# 'flag' is already boolean, so it is used directly as the mask, and .copy() makes
# df_nan an independent frame that can be written to later without ambiguity.
df_nan = df_total.loc[df_total['flag']].copy()
df_nan.shape

(24, 21)

In [None]:
# Days that do have an opinion: the labelled data used for training and testing.
# The boolean 'flag' column is negated directly, and .copy() gives an independent frame.
df_clean = df_total.loc[~df_total['flag']].copy()
df_clean.shape

(341, 21)

# KNN

In [None]:
# Feature matrix: the 18 weather measurements; target: the OPINION label.
feature_columns = ['MAX_TEMPERATURE_C', 'MIN_TEMPERATURE_C', 'WINDSPEED_MAX_KMH',
                   'TEMPERATURE_MORNING_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_EVENING_C',
                   'PRECIP_TOTAL_DAY_MM', 'HUMIDITY_MAX_PERCENT', 'VISIBILITY_AVG_KM',
                   'PRESSURE_MAX_MB', 'CLOUDCOVER_AVG_PERCENT', 'HEATINDEX_MAX_C',
                   'DEWPOINT_MAX_C', 'WINDTEMP_MAX_C', 'WEATHER_CODE_EVENING',
                   'TOTAL_SNOW_MM', 'UV_INDEX', 'SUNHOUR']
X = df_clean[feature_columns]
y = df_clean["OPINION"]

# A fixed random_state makes the 75/25 split identical on every run, so the
# scores reported by the following cells are reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, random_state = 42)

# Baseline KNN with the library's default hyper-parameters.
modelKNN = KNeighborsClassifier()
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Confusion matrix of the KNN model on the test split (rows: actual, columns: predicted).
# labels= pins the matrix rows/columns to modelKNN.classes_; without it the matrix is
# built only from labels present in y_test / the predictions, and would no longer match
# the index and column names if a class were missing from the test split.
pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = modelKNN.predict(X_test), labels = modelKNN.classes_),
             index = modelKNN.classes_ + " ACTUAL",
             columns = modelKNN.classes_ + " PREDICTED")

Unnamed: 0,bad PREDICTED,good PREDICTED,not good not bad PREDICTED,very bad PREDICTED,very good PREDICTED
bad ACTUAL,18,0,0,2,0
good ACTUAL,0,21,3,0,0
not good not bad ACTUAL,3,5,7,0,0
very bad ACTUAL,5,0,1,11,0
very good ACTUAL,0,8,0,0,2


In [None]:
# Accuracy of the current KNN model on the split it was fitted on and on the test split.
knn_train_accuracy = modelKNN.score(X_train, y_train)
knn_test_accuracy = modelKNN.score(X_test, y_test)
print("accuracy score on train set:", knn_train_accuracy)
print("accuracy score on test set:", knn_test_accuracy)

accuracy score on train set: 0.8196078431372549
accuracy score on test set: 0.686046511627907


In [None]:
# Per-class precision / recall / f1 of the KNN model on the test split.
knn_test_predictions = modelKNN.predict(X_test)
print(classification_report(y_test, knn_test_predictions))

                  precision    recall  f1-score   support

             bad       0.69      0.90      0.78        20
            good       0.62      0.88      0.72        24
not good not bad       0.64      0.47      0.54        15
        very bad       0.85      0.65      0.73        17
       very good       1.00      0.20      0.33        10

        accuracy                           0.69        86
       macro avg       0.76      0.62      0.62        86
    weighted avg       0.73      0.69      0.66        86



In [None]:
# Exhaustive search over the number of neighbours and the weighting scheme,
# keeping the combination with the highest accuracy on the test split.
# bestModel holds the best score seen so far (not a model object).
bestModel = 0

for neighbor_count in range(2, len(X_train) + 1):
  # "uniform" is tried before "distance"; with the strict ">" below, the earlier
  # candidate is kept on ties.
  for weighting in ("uniform", "distance"):
    candidate = KNeighborsClassifier(n_neighbors=neighbor_count, weights=weighting)
    candidate.fit(X_train, y_train)
    candidate_score = candidate.score(X_test, y_test)
    if candidate_score > bestModel:
      bestModel = candidate_score
      best_number_of_neighbors = neighbor_count
      weights_check = weighting

print (f"The optinal values are {best_number_of_neighbors} number of neighbors using a {weights_check} weights. The test score is {round(bestModel, 4)}")

The optinal values are 10 number of neighbors using a distance weights. The test score is 0.7674


In [None]:
# Refit KNN with the hyper-parameters selected by the search above, rather than a
# hard-coded n_neighbors, so the reported score is the one the search found.
modelKNN = KNeighborsClassifier(n_neighbors=best_number_of_neighbors, weights=weights_check)
modelKNN.fit(X_train, y_train)

print(f"Accuracy score on the train dataset: {modelKNN.score(X_train, y_train)}")
print(f"Accuracy score on the test dataset: {modelKNN.score(X_test, y_test)}")

Accuracy score on the train dataset: 1.0
Accuracy score on the test dataset: 0.6976744186046512


# Logistic regression

In [None]:
# The features are on very different scales (e.g. pressure in mb vs. UV index), and
# lbfgs does not converge on the raw values even with max_iter=1000. Standardising
# inside a pipeline addresses that, and the scaler is fitted on the training split only.
# The pipeline still exposes .fit / .predict / .score / .classes_.
model_logistic_regression = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model_logistic_regression.fit(X_train, y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Accuracy of the logistic regression on the training split and on the test split.
logreg_train_accuracy = model_logistic_regression.score(X_train, y_train)
logreg_test_accuracy = model_logistic_regression.score(X_test, y_test)
print(f"Accuracy score on the train dataset: {logreg_train_accuracy}")
print(f"Accuracy score on the test dataset: {logreg_test_accuracy}")

Accuracy score on the train dataset: 0.9019607843137255
Accuracy score on the test dataset: 0.7558139534883721


In [None]:
# Effect of the solver's iteration budget on accuracy. One loop replaces four
# copy-pasted fit/score stanzas; the printed lines are the same.
# After the loop, model_logistic_regression is the last model fitted (max_iter=3000).
for iteration_budget in (5, 20, 100, 3000):
  model_logistic_regression = LogisticRegression(max_iter=iteration_budget)
  model_logistic_regression.fit(X_train, y_train)
  print(f"Accuracy score on the train dataset with {iteration_budget} iterations: {model_logistic_regression.score(X_train, y_train)}")
  print(f"Accuracy score on the test dataset with {iteration_budget} iterations: {model_logistic_regression.score(X_test, y_test)}")


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

Accuracy score on the train dataset with 5 iterations: 0.2627450980392157
Accuracy score on the test dataset with 5 iterations: 0.27906976744186046
Accuracy score on the train dataset with 20 iterations: 0.7294117647058823
Accuracy score on the test dataset with 20 iterations: 0.6395348837209303
Accuracy score on the train dataset with 100 iterations: 0.8117647058823529
Accuracy score on the test dataset with 100 iterations: 0.7441860465116279
Accuracy score on the train dataset with 3000 iterations: 0.8941176470588236
Accuracy score on the test dataset with 3000 iterations: 0.7790697674418605



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
# Confusion matrix of the logistic regression on the test split (rows: actual, columns: predicted).
# labels= pins the matrix rows/columns to the model's classes_, so it always lines up
# with the index and column names even if a class is missing from the test split.
pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = model_logistic_regression.predict(X_test), labels = model_logistic_regression.classes_),
             index = model_logistic_regression.classes_ + " ACTUAL",
             columns = model_logistic_regression.classes_ + " PREDICTED")

Unnamed: 0,bad PREDICTED,good PREDICTED,not good not bad PREDICTED,very bad PREDICTED,very good PREDICTED
bad ACTUAL,16,0,3,1,0
good ACTUAL,0,19,4,0,1
not good not bad ACTUAL,2,4,8,1,0
very bad ACTUAL,0,0,0,17,0
very good ACTUAL,0,3,0,0,7


# Decision Tree

In [None]:
# Baseline decision tree with no depth limit. random_state fixes the tie-breaking
# between equally good splits, so the fitted tree is the same on every run.
modelDTC = DecisionTreeClassifier(random_state=42)
modelDTC.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Accuracy of the current decision tree on the training split and on the test split.
tree_train_accuracy = modelDTC.score(X_train, y_train)
tree_test_accuracy = modelDTC.score(X_test, y_test)
print("accuracy score on train set:", tree_train_accuracy)
print("accuracy score on test set:", tree_test_accuracy)

accuracy score on train set: 1.0
accuracy score on test set: 0.813953488372093


In [None]:
# Per-class precision / recall / f1 of the decision tree on the test split.
tree_test_predictions = modelDTC.predict(X_test)
print(classification_report(y_test, tree_test_predictions))

                  precision    recall  f1-score   support

             bad       1.00      0.85      0.92        20
            good       0.72      0.75      0.73        24
not good not bad       0.64      0.93      0.76        15
        very bad       1.00      1.00      1.00        17
       very good       0.80      0.40      0.53        10

        accuracy                           0.81        86
       macro avg       0.83      0.79      0.79        86
    weighted avg       0.84      0.81      0.81        86



In [None]:
# Search max_depth from 1 to 7, keeping the depth with the highest test accuracy.
# Another_Model holds the best score seen so far (not a model object).
Another_Model = 0

for i in range(1,8):
  # Seeded trees: the same depth always yields the same tree, so the search
  # result does not change from one run to the next.
  modelDTC = DecisionTreeClassifier(max_depth = i, random_state = 42)
  modelDTC.fit(X_train, y_train)
  Another_Model_Trained = modelDTC.score(X_test, y_test)
  # Strict ">" keeps the shallowest tree among equally accurate depths.
  if Another_Model_Trained > Another_Model:
    Another_Model = Another_Model_Trained
    best_number_of_depths = i

print (f"The optinal value is {best_number_of_depths} number of depth. The test score is {Another_Model}")

The optinal value is 6 number of depth. The test score is 0.8837209302325582


In [None]:
# Refit the tree at the depth selected by the search. A fixed random_state makes
# this fit deterministic; an unseeded tree can differ between fits at the same depth.
modelDTC = DecisionTreeClassifier(max_depth=best_number_of_depths, random_state=42)
modelDTC.fit(X_train, y_train)

print(f"Accuracy score on the train dataset: {modelDTC.score(X_train, y_train)}")
print(f"Accuracy score on the test dataset: {modelDTC.score(X_test, y_test)}")

Accuracy score on the train dataset: 0.9529411764705882
Accuracy score on the test dataset: 0.8837209302325582


In [None]:
# Confusion matrix of the decision tree on the test split (rows: actual, columns: predicted).
# labels= pins the matrix rows/columns to modelDTC.classes_, so it always lines up
# with the index and column names even if a class is missing from the test split.
pd.DataFrame(data = confusion_matrix(y_true = y_test, y_pred = modelDTC.predict(X_test), labels = modelDTC.classes_),
             index = modelDTC.classes_ + " ACTUAL",
             columns = modelDTC.classes_ + " PREDICTED")

Unnamed: 0,bad PREDICTED,good PREDICTED,not good not bad PREDICTED,very bad PREDICTED,very good PREDICTED
bad ACTUAL,17,0,3,0,0
good ACTUAL,0,20,2,0,2
not good not bad ACTUAL,0,2,13,0,0
very bad ACTUAL,0,0,0,17,0
very good ACTUAL,0,1,0,0,9


# Prediction

The modelDTC got the highest accuracy score on the test dataset so I am going to use this one for the prediction.

In [None]:
# Feature columns for prediction, taken from the training matrix itself instead of
# a positional slice of df_clean: the names and their order then match what the
# model was fitted on even if columns are added to or reordered in the merged table.
cols = X.columns
cols

Index(['MAX_TEMPERATURE_C', 'MIN_TEMPERATURE_C', 'WINDSPEED_MAX_KMH',
       'TEMPERATURE_MORNING_C', 'TEMPERATURE_NOON_C', 'TEMPERATURE_EVENING_C',
       'PRECIP_TOTAL_DAY_MM', 'HUMIDITY_MAX_PERCENT', 'VISIBILITY_AVG_KM',
       'PRESSURE_MAX_MB', 'CLOUDCOVER_AVG_PERCENT', 'HEATINDEX_MAX_C',
       'DEWPOINT_MAX_C', 'WINDTEMP_MAX_C', 'WEATHER_CODE_EVENING',
       'TOTAL_SNOW_MM', 'UV_INDEX', 'SUNHOUR'],
      dtype='object')

In [None]:
# Preview the opinions the decision tree predicts for the days without one.
missing_opinion_features = df_nan[cols]
modelDTC.predict(missing_opinion_features)

array(['bad', 'very bad', 'not good not bad', 'not good not bad', 'bad',
       'good', 'good', 'good', 'not good not bad', 'good', 'good', 'good',
       'very good', 'good', 'very good', 'good', 'good', 'good',
       'not good not bad', 'not good not bad', 'very bad', 'very bad',
       'very bad', 'bad'], dtype=object)

In [None]:
# Write the predicted opinions into the rows that had none. assign() returns a new
# frame, so nothing is set on a slice of df_total and no SettingWithCopyWarning is
# raised; the column order is unchanged because OPINION already exists.
df_nan = df_nan.assign(OPINION=modelDTC.predict(df_nan[cols]))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
# Stack the originally labelled days and the days with predicted opinions into one table.
# Rows keep their original index labels; df_nan's rows come after df_clean's.
frames = [df_clean, df_nan]
table_finale = pd.concat(frames)

In [None]:
# Fixed display order for the five opinion labels, worst to best, so both charts
# use the same category order regardless of how frequent each label is.
# NOTE(review): the previous positional reindex([1, 2, 3, 0, 4]) depended on the
# frequency ranking; this explicit order is assumed to be the intended one - confirm.
OPINION_ORDER = ["very bad", "bad", "not good not bad", "good", "very good"]


def _opinion_counts(frame):
  """Count each OPINION label in `frame`.

  Returns a DataFrame with the label in column "index" and its count in column
  "OPINION" (the column names the plotting cell reads), ordered by OPINION_ORDER.
  Labels absent from `frame` get a count of 0; labels not listed in
  OPINION_ORDER are kept and appended after the known ones.
  """
  counts = frame["OPINION"].value_counts()
  extra_labels = [label for label in counts.index if label not in OPINION_ORDER]
  counts = counts.reindex(OPINION_ORDER + extra_labels, fill_value=0)
  # Built explicitly rather than via reset_index(), whose output column names
  # differ between pandas versions.
  return pd.DataFrame({"index": counts.index.to_numpy(), "OPINION": counts.to_numpy()})


# value_counts() ignores missing values, so the "before" table only counts labelled days.
df_Opinion_Before_Prediction = _opinion_counts(df_total)
df_Opinion_After_Prediction = _opinion_counts(table_finale)

In [None]:
# Two bar charts side by side: opinion counts before and after filling in predictions.
panels = (
  ("Number of opinions before prediction", df_Opinion_Before_Prediction),
  ("Number of opinion after prediction", df_Opinion_After_Prediction),
)
fig = make_subplots(rows=1, cols=2, subplot_titles=tuple(panel_title for panel_title, panel_counts in panels))

for panel_col, (panel_title, panel_counts) in enumerate(panels, start=1):
  fig.add_trace(go.Bar(x = panel_counts["index"], y = panel_counts["OPINION"]),
                row=1, col=panel_col)

fig.update_layout(autosize=False, template='plotly_dark', width = 1500, height = 700, showlegend=False)

# Same fixed y-range on both panels so bar heights can be compared directly.
for panel_col in range(1, len(panels) + 1):
  fig.update_yaxes(title_text="", row=1, col=panel_col, range=[0, 110])

fig.show()