In [1]:
#Imports
import csv
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score,confusion_matrix
%matplotlib inline

#Leitura dos ficheiros
train = pd.read_csv('datasets/training_data.csv',encoding='cp1252')
test = pd.read_csv('datasets/test_data.csv',encoding='cp1252')

#Funções auxiliares
ordem = {'None':0, 'Low':1, 'Medium':2, 'High':3, 'Very_High':4}

def round_pred(pred,predictions):
    for n in pred:
        n = int(round(n))
        if n == 0: predictions.append('None')
        elif n == 1: predictions.append('Low')
        elif n == 2: predictions.append('Medium')
        elif n == 3: predictions.append('High')
        else: predictions.append('Very_High')
            
def hours(hour):
    if hour > 0 and hour <= 8: return 'Noite'
    elif hour > 8 and hour <= 16: return 'Hora_trabalho'
    else: return 'Final_dia'

In [2]:
#Tratamento de dados

#Feriados - Dias da semana - Estações
feriados = ['2018-01-01', '2018-02-13', '2018-03-30', '2018-04-01', '2018-04-25', '2018-05-01', '2018-05-31', 
            '2018-06-10', '2018-08-15', '2018-10-05', '2018-11-01', '2018-12-01', '2018-12-08', '2018-12-25',
            '2019-01-01', '2019-03-05', '2019-04-19', '2019-04-21', '2019-04-25', '2019-05-01', '2019-06-10', 
            '2019-06-20', '2019-08-15', '2019-10-05', '2019-11-01', '2019-12-01', '2019-12-08', '2019-12-25']

for index,row in train.iterrows():
    data_hour = row[1].split(' ')
    data = data_hour[0]
    hour_split = data_hour[1]
    hour = hour_split.split(':')[0]
    month = data.split('-')[1]
    train.at[index,'Parte_dia'] = hours(int(hour))
    if month in ['01','02','03']: train.at[index,'Estacao'] = 'Inverno'
    elif month in ['04','05','06']: train.at[index,'Estacao'] = 'Primavera'
    elif month in ['07','08','09']: train.at[index,'Estacao'] = 'Verao'
    else: train.at[index,'Estacao'] = 'Outono'
    dia = pd.Timestamp(data)
    train.at[index,'Dia'] = dia.day_name()
    if data in feriados: train.at[index,'Feriado'] = True
    else: train.at[index,'Feriado'] = False
        
for index,row in test.iterrows():
    data_hour = row[1].split(' ')
    data = data_hour[0]
    hour_split = data_hour[1]
    hour = hour_split.split(':')[0]
    month = data.split('-')[1]
    test.at[index,'Parte_dia'] = hours(int(hour))
    if month in ['01','02','03']: test.at[index,'Estacao'] = 'Inverno'
    elif month in ['04','05','06']: test.at[index,'Estacao'] = 'Primavera'
    elif month in ['07','08','09']: test.at[index,'Estacao'] = 'Verao'
    else: test.at[index,'Estacao'] = 'Outono'
    dia = pd.Timestamp(data)
    test.at[index,'Dia'] = dia.day_name()
    if data in feriados: test.at[index,'Feriado'] = True
    else: test.at[index,'Feriado'] = False
        
del train['AVERAGE_PRECIPITATION']
del test['AVERAGE_PRECIPITATION']
del train['city_name']
del test['city_name']
del train['record_date']
del test['record_date']
del train['AVERAGE_CLOUDINESS']
del test['AVERAGE_CLOUDINESS']
del train['AVERAGE_RAIN']
del test['AVERAGE_RAIN']
del train['AVERAGE_TEMPERATURE']
del test['AVERAGE_TEMPERATURE']
del train['AVERAGE_WIND_SPEED']
del test['AVERAGE_WIND_SPEED']
del train['AVERAGE_ATMOSP_PRESSURE']
del test['AVERAGE_ATMOSP_PRESSURE']
del train['AVERAGE_HUMIDITY']
del test['AVERAGE_HUMIDITY']

In [3]:
#Preparação para colocar nos modelos
X = train.drop(['AVERAGE_SPEED_DIFF'],axis=1)
Y = train['AVERAGE_SPEED_DIFF'].to_frame()
Y_2 = train['AVERAGE_SPEED_DIFF'].map(ordem)
X_teste = test
#Y_test = test['AVERAGE_SPEED_DIFF'].to_frame()

luminosity = LabelEncoder()
feriado = LabelEncoder()
day = LabelEncoder()
season = LabelEncoder()
hour = LabelEncoder()
luminosity_t = LabelEncoder()
feriado_t = LabelEncoder()
day_t = LabelEncoder()
season_t = LabelEncoder()
hour_t = LabelEncoder()

X['LUMINOSITY_n'] = luminosity.fit_transform(X['LUMINOSITY'])
X['Feriado_n'] = feriado.fit_transform(X['Feriado'])
X['Dia_n'] = day.fit_transform(X['Dia'])
X['Estacao_n'] = season.fit_transform(X['Estacao'])
X['Parte_dia_n'] = hour.fit_transform(X['Parte_dia'])

X_teste['LUMINOSITY_n'] = luminosity_t.fit_transform(X_teste['LUMINOSITY'])
X_teste['Feriado_n'] = feriado_t.fit_transform(X_teste['Feriado'])
X_teste['Dia_n'] = day_t.fit_transform(X_teste['Dia'])
X_teste['Estacao_n'] = season_t.fit_transform(X_teste['Estacao'])
X_teste['Parte_dia_n'] = hour_t.fit_transform(X_teste['Parte_dia'])

X = X.drop(['LUMINOSITY','Dia','Estacao','Parte_dia','Feriado'],axis=1)
X_teste = X_teste.drop(['LUMINOSITY', 'Dia','Estacao','Parte_dia','Feriado'],axis=1)

In [6]:
#DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state= 2021)
dtc.fit(X,Y)
predictions = dtc.predict(X_teste)
predictions

array(['None', 'Medium', 'None', ..., 'None', 'Very_High', 'Low'],
      dtype=object)

In [7]:
#Linear Regression
lr = LinearRegression()
lr.fit(X,Y_2)
pred = lr.predict(X_teste)
predictions = []
round_pred(pred,predictions)
predictions

['None',
 'Low',
 'None',
 'High',
 'Low',
 'Medium',
 'Low',
 'Low',
 'Low',
 'Medium',
 'None',
 'Low',
 'Medium',
 'High',
 'High',
 'High',
 'None',
 'High',
 'Low',
 'None',
 'None',
 'Low',
 'Low',
 'None',
 'Medium',
 'Medium',
 'Medium',
 'None',
 'None',
 'Very_High',
 'Low',
 'Low',
 'Very_High',
 'Low',
 'Low',
 'Low',
 'Low',
 'High',
 'Medium',
 'None',
 'Low',
 'None',
 'High',
 'Very_High',
 'High',
 'Low',
 'Low',
 'Low',
 'Low',
 'Low',
 'Low',
 'Low',
 'Very_High',
 'Medium',
 'None',
 'Medium',
 'High',
 'Very_High',
 'Very_High',
 'None',
 'None',
 'Low',
 'Low',
 'Low',
 'Very_High',
 'None',
 'Very_High',
 'Low',
 'Medium',
 'Very_High',
 'None',
 'Low',
 'Low',
 'Low',
 'Very_High',
 'High',
 'Low',
 'None',
 'Medium',
 'Medium',
 'Low',
 'Low',
 'Very_High',
 'None',
 'None',
 'Low',
 'None',
 'Medium',
 'None',
 'High',
 'Medium',
 'None',
 'None',
 'High',
 'Medium',
 'None',
 'Very_High',
 'None',
 'None',
 'Very_High',
 'Medium',
 'Low',
 'None',
 'None',
 '

In [11]:
#Logistic Regression
logr = LogisticRegression(max_iter=10000)
#X_train,X_test,y_train,y_test = train_test_split(X,Y_2, test_size=0.2, random_state=2021)
#logr.fit(X_train,y_train)
pred = logr.predict(X_teste)
predictions = []
round_pred(pred,predictions)
predictions
#accuracy_score(y_test, pred)

0.7681584739545121

In [16]:
#LinearSVC
model = LinearSVC(dual=False)
model.fit(X,Y)

predictions = model.predict(X_teste)
predictions

  y = column_or_1d(y, warn=True)


array(['None', 'Low', 'None', ..., 'None', 'Very_High', 'Medium'],
      dtype=object)

In [4]:
#GridSearch
param_grid = {'C': [1000], 'gamma': [0.0001],'kernel': ['rbf']}
grid = GridSearchCV(SVC(random_state=2021),param_grid,refit=True,verbose=3)

#X_train,X_test,y_train,y_test = train_test_split(X,Y, test_size=0.2, random_state=2021)
grid.fit(X,Y)
#grid.fit(X_train,y_train)

predictions = grid.predict(X_teste)
#predictions = grid.predict(X_test)
#accuracy_score(y_test, predictions)
predictions

Fitting 5 folds for each of 1 candidates, totalling 5 fits


  y = column_or_1d(y, warn=True)


[CV 1/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.769 total time=   1.6s


  y = column_or_1d(y, warn=True)


[CV 2/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.796 total time=   1.6s


  y = column_or_1d(y, warn=True)


[CV 3/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.795 total time=   1.5s


  y = column_or_1d(y, warn=True)


[CV 4/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.786 total time=   1.4s


  y = column_or_1d(y, warn=True)


[CV 5/5] END ..C=1000, gamma=0.0001, kernel=rbf;, score=0.782 total time=   1.1s


  y = column_or_1d(y, warn=True)


array(['None', 'Medium', 'None', ..., 'None', 'Very_High', 'Low'],
      dtype=object)

In [5]:
submission = open("submissions/submission23.csv", "w", newline='')
writer = csv.writer(submission)
writer.writerow(['RowId', 'Speed_Diff'])
rowId = list(range(0, 1500))
for i in range(1500):
    writer.writerow([rowId[i]+1, predictions[i]])

submission.close()