In [3]:
#Imports
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix
%matplotlib inline

# Load the data files.
# pandas >= 2.0 added the literal string 'None' to read_csv's default NA markers.
# 'None' is a legitimate class label here (see the `ordem` mapping below), so the
# NA markers are pinned to the pre-2.0 default list, which does not contain it.
NA_MARKERS = ['', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
              '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null']
train = pd.read_csv('datasets/training_data.csv', encoding='cp1252',
                    na_values=NA_MARKERS, keep_default_na=False)
test = pd.read_csv('datasets/test_data.csv', encoding='cp1252',
                   na_values=NA_MARKERS, keep_default_na=False)

# Helper functions

# Ordinal encoding of the target classes (label -> integer code).
ordem = {'None':0, 'Low':1, 'Medium':2, 'High':3, 'Very_High':4}

def round_pred(pred, predictions):
    """Convert numeric predictions to class labels and append them to `predictions`.

    Each value in `pred` is rounded to the nearest integer code and clamped to the
    range of codes defined in `ordem`, so values below the lowest code map to
    'None' and values above the highest code map to 'Very_High'.

    Parameters:
        pred: iterable of numeric predictions (e.g. a regressor's output).
        predictions: list that receives one label per value, in order.

    Returns:
        None; `predictions` is extended in place.
    """
    label_by_code = {code: label for label, code in ordem.items()}
    lowest, highest = min(label_by_code), max(label_by_code)
    for n in pred:
        # Clamp so an out-of-range regression output lands on the nearest class
        # instead of a negative value being reported as 'Very_High'.
        code = min(max(int(round(n)), lowest), highest)
        predictions.append(label_by_code[code])
            
def hours(hour):
    """Return the part-of-day label for an hour of the day.

    Hours 1-8 give 'Noite', hours 9-16 give 'Hora_trabalho', and every other
    value (including hour 0) gives 'Final_dia'.
    """
    if 0 < hour <= 8:
        return 'Noite'
    if 8 < hour <= 16:
        return 'Hora_trabalho'
    return 'Final_dia'

In [4]:
# Data preparation

# Holidays - Weekdays - Seasons
feriados = ['2018-01-01', '2018-02-13', '2018-03-30', '2018-04-01', '2018-04-25', 
            '2018-05-01', '2018-05-31', '2018-06-10', '2018-08-15', '2018-10-05', 
            '2018-11-01', '2018-12-01', '2018-12-08', '2018-12-25', '2019-01-01',
            '2019-03-05', '2019-04-19', '2019-04-21', '2019-04-25', '2019-05-01',
            '2019-06-10', '2019-06-20', '2019-08-15', '2019-10-05', '2019-11-01', 
            '2019-12-01', '2019-12-08', '2019-12-25']

def add_date_features(frame, holidays=feriados):
    """Return a copy of `frame` with the date-derived feature columns added.

    The timestamp string is read from the column at position 1 (presumably
    `record_date`, formatted as 'YYYY-MM-DD HH:MM:SS' - confirm against the CSV)
    and split into a date part and a time part.

    Added columns:
        Parte_dia: part-of-day label returned by `hours` for the hour.
        Estacao:   'Inverno' (months 01-03), 'Primavera' (04-06),
                   'Verao' (07-09) or 'Outono' (any other month).
        Dia:       English weekday name of the date.
        Feriado:   True when the date string is listed in `holidays`.

    Parameters:
        frame: DataFrame whose second column holds the timestamp strings.
        holidays: collection of 'YYYY-MM-DD' strings treated as holidays.

    Returns:
        A new DataFrame; `frame` itself is not modified.
    """
    # Select the column explicitly by position: integer keys on a label-indexed
    # row Series (row[1]) are deprecated in recent pandas versions.
    stamp_parts = frame.iloc[:, 1].str.split(' ')
    date = stamp_parts.str[0]
    hour_of_day = stamp_parts.str[1].str.split(':').str[0].astype(int)
    month = date.str.split('-').str[1]

    season_by_month = {
        '01': 'Inverno', '02': 'Inverno', '03': 'Inverno',
        '04': 'Primavera', '05': 'Primavera', '06': 'Primavera',
        '07': 'Verao', '08': 'Verao', '09': 'Verao',
    }
    return frame.assign(
        Parte_dia=hour_of_day.map(hours),
        # Months without an entry above fall back to 'Outono'.
        Estacao=month.map(season_by_month).fillna('Outono'),
        Dia=pd.to_datetime(date).dt.day_name(),
        Feriado=date.isin(holidays),
    )

train = add_date_features(train)
test = add_date_features(test)
        

# Remove, from both datasets, the columns that are not used to build the model inputs.
unused_columns = [
    'city_name',
    'record_date',
    'AVERAGE_TEMPERATURE',
    'AVERAGE_ATMOSP_PRESSURE',
    'AVERAGE_HUMIDITY',
    'AVERAGE_WIND_SPEED',
    'AVERAGE_CLOUDINESS',
    'AVERAGE_PRECIPITATION',
    'AVERAGE_RAIN',
]
for frame in (train, test):
    for column in unused_columns:
        # In-place deletion; raises KeyError if the column is already gone.
        del frame[column]

In [5]:
# Preparation of the model inputs
X = train.drop(['AVERAGE_SPEED_DIFF'],axis=1) # Training features
Y = train['AVERAGE_SPEED_DIFF'].to_frame()    # Training target as string labels (single-column frame)
Y_2 = train['AVERAGE_SPEED_DIFF'].map(ordem)  # Training target as ordinal codes
X_teste = test.copy() # Test features; copied so that `test` itself is not modified

luminosity = LabelEncoder()
feriado = LabelEncoder()
day = LabelEncoder()
season = LabelEncoder()
hour = LabelEncoder()

# One encoder per categorical column, shared by the training and the test set.
# Fitting a separate encoder on each dataset can assign different integer codes
# to the same category whenever the two datasets do not contain exactly the same
# set of values, which silently corrupts the test features.
categorical_encoders = {
    'LUMINOSITY': luminosity,
    'Feriado': feriado,
    'Dia': day,
    'Estacao': season,
    'Parte_dia': hour,
}

for column, encoder in categorical_encoders.items():
    # Fit on the values of both datasets so that every category present in
    # either one receives a code and transform() cannot hit an unseen label.
    encoder.fit(pd.concat([X[column], X_teste[column]], ignore_index=True))
    X[column + '_n'] = encoder.transform(X[column])
    X_teste[column + '_n'] = encoder.transform(X_teste[column])

# The *_t names are kept for backward compatibility; they now refer to the same
# fitted encoders as the training ones.
luminosity_t = luminosity
feriado_t = feriado
day_t = day
season_t = season
hour_t = hour

# Drop the original categorical columns now that their encoded versions exist.
X = X.drop(['LUMINOSITY','Dia','Estacao','Parte_dia','Feriado'],axis=1)
X_teste = X_teste.drop(['LUMINOSITY', 'Dia','Estacao','Parte_dia','Feriado'],axis=1)

In [9]:
# Linear regression fitted on the ordinal encoding of the target (Y_2)
lr = LinearRegression()

# Hold-out evaluation: 30% of the training data is set aside for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, Y_2, test_size=0.3, random_state=2021
)
pred = lr.fit(X_train, y_train).predict(X_test)

# The MSE is computed once and reused for the RMSE
mse = metrics.mean_squared_error(y_test, pred)
print('MAE: ', metrics.mean_absolute_error(y_test, pred))
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))

# Alternative: fit on the full training data and predict the test dataset
#lr.fit(X,Y_2)
#pred = lr.predict(X_teste)
#predictions = []
#round_pred(pred,predictions)

#predictions

MAE:  0.5511569522073306
MSE:  0.46447359920153003
RMSE:  0.6815229997597514


In [28]:
# Logistic regression fitted on the string class labels
logr = LogisticRegression(max_iter=10000)

# Hold-out evaluation: 30% of the training data is set aside for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, np.ravel(Y), test_size=0.3, random_state=2021
)
predictions = logr.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, predictions)

# Alternative: fit on the full training data and predict the test dataset
#logr.fit(X,np.ravel(Y))
#predictions = logr.predict(X_teste)

#predictions

0.7651663405088063

In [16]:
# Decision tree classifier fitted on the string class labels
dtc = DecisionTreeClassifier(random_state=2021)

# Hold-out evaluation: 30% of the training data is set aside for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=2021
)
predictions = dtc.fit(X_train, y_train).predict(X_test)

holdout_confusion = confusion_matrix(y_test, predictions)
holdout_accuracy = accuracy_score(y_test, predictions)
print(holdout_confusion)
print(holdout_accuracy)

# Alternative: fit on the full training data and predict the test dataset
#dtc.fit(X,Y)
#predictions = dtc.predict(X_teste)

#predictions

[[234   3  66   1  30]
 [  7 241  63  81   0]
 [ 72  84 326   7   1]
 [  1 107   4 562   0]
 [ 40   0   1   0 113]]
0.7221135029354208


In [23]:
# Support vector classifier with default hyper-parameters
svc = SVC(random_state=2021)

# Hold-out evaluation: 30% of the training data is set aside for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, np.ravel(Y), test_size=0.3, random_state=2021
)
predictions = svc.fit(X_train, y_train).predict(X_test)
holdout_accuracy = accuracy_score(y_test, predictions)
print(holdout_accuracy)

# Alternative: fit on the full training data and predict the test dataset
#svc.fit(X,np.ravel(Y))
#predictions = svc.predict(X_teste)

#predictions

0.7597847358121331


In [22]:
# Cross-validated SVC; the grid currently holds a single candidate
param_grid = {
    'C': [1000],
    'gamma': [0.0001],
    'kernel': ['rbf'],
}
grid = GridSearchCV(
    estimator=SVC(random_state=2021),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)

# Hold-out evaluation: 30% of the training data is set aside for validation
X_train, X_test, y_train, y_test = train_test_split(
    X, np.ravel(Y), test_size=0.3, random_state=2021
)
grid.fit(X_train, y_train)
predictions = grid.predict(X_test)
holdout_accuracy = accuracy_score(y_test, predictions)
print(holdout_accuracy)

# Alternative: fit on the full training data and predict the test dataset
#grid.fit(X,Y)
#predictions = grid.predict(X_teste)

#predictions

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.772 total time=   0.8s
[CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.782 total time=   0.7s
[CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.779 total time=   0.8s
[CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.796 total time=   0.9s
[CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.778 total time=   0.9s
0.7865003668378576


In [5]:
# Write the submission file: a header followed by one (RowId, Speed_Diff) row per
# prediction, with RowId starting at 1.
# NOTE(review): `predictions` is whatever the most recently executed model cell
# left behind. In the cells above, the lines that predict on X_teste are commented
# out, so confirm that `predictions` was produced from the test dataset first.
N_ROWS = 1500  # number of rows written to the submission

# Check up front so that a short prediction vector does not leave a partially
# written file behind.
if len(predictions) < N_ROWS:
    raise ValueError(
        'expected at least %d predictions, got %d' % (N_ROWS, len(predictions))
    )

# The context manager closes the file even if writing a row fails.
with open("submissions/submission23.csv", "w", newline='') as submission:
    writer = csv.writer(submission)
    writer.writerow(['RowId', 'Speed_Diff'])
    # Only the first N_ROWS predictions are written.
    for row_id, label in enumerate(predictions[:N_ROWS], start=1):
        writer.writerow([row_id, label])