In [1]:
import pandas as pd

In [2]:
# Load the dataset from a local parquet file into a DataFrame.
# NOTE(review): the path is relative, so the file is presumably expected next to the notebook — confirm.
df = pd.read_parquet("dataset_mod_interm_csgo_round.parquet")

### Tratamento de dados nulos

In [4]:
# Build a per-column summary of missing values so nulls can be spotted and handled.
#
# isna().sum() yields one NaN count per column; to_frame() + reset_index() turn that
# Series into a two-column frame (original column label, count), and the rename gives
# both columns readable headers.
df_nan = (
    df.isna()
    .sum()
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'coluna', 0: 'total_nan'})
)

In [5]:
# Show only the columns that actually contain missing values (NaN count above zero).
df_nan.loc[df_nan['total_nan'] > 0]

Unnamed: 0,coluna,total_nan


### Tratamento das variaveis

In [6]:
# Tabulate the dtype of every column: one row per column, holding its name and its type.
variaveis = (
    df.dtypes
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'coluna', 0: 'tipo'})
)

# Partition that table by dtype: 'object' columns are treated as categorical,
# everything else (i.e. not object) goes into the non-categorical group.
variaveis_categoricas = variaveis.loc[variaveis['tipo'] == 'object']
variaveis_continuas = variaveis.loc[variaveis['tipo'] != 'object']

#### variaveis continuas

In [7]:
# List the distinct dtypes found among the non-categorical columns.
variaveis_continuas['tipo'].unique()

array([dtype('float64'), dtype('bool')], dtype=object)

In [8]:
# Boolean columns have to be converted to numeric values (0 or 1),
# so list which non-categorical columns are of dtype bool.
variaveis_continuas.loc[variaveis_continuas['tipo'] == 'bool']

Unnamed: 0,coluna,tipo
4,bomb_planted,bool


In [9]:
# Cast the bomb_planted column from True/False to 1/0.
df['bomb_planted'] = df['bomb_planted'].astype(int)

### tratamento das variaveis categoricas

In [10]:
variaveis_categoricas # only map (the map name) and round_winner are categorical variables

Unnamed: 0,coluna,tipo
3,map,object
96,round_winner,object


In [11]:
# importar bibliotecas necessarias

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [12]:
# Label-encode every categorical column, writing the integer codes to a new
# column named "<column>_encoded".
#
# Each column gets its OWN fitted LabelEncoder, stored in `encoders` under the
# column name. Re-fitting a single shared encoder inside the loop would overwrite
# its classes_ on every iteration, leaving only the last column's mapping and
# making the other encoded columns impossible to decode with inverse_transform.
# `le` is still bound after the loop (to the encoder of the last column processed).
#
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder; the integer
# codes it assigns to a feature such as the map name imply an ordering that a linear
# model will interpret numerically — consider one-hot encoding for features.
encoders = {}

for coluna in variaveis_categoricas['coluna']:
    le = LabelEncoder()  # fresh encoder per column so each mapping is preserved
    df[str(coluna) + '_encoded'] = le.fit_transform(df[coluna])
    encoders[coluna] = le

In [13]:
# Show each categorical column side by side with its encoded counterpart.
df.loc[:, ['round_winner', 'round_winner_encoded', 'map', 'map_encoded']]

Unnamed: 0,round_winner,round_winner_encoded,map,map_encoded
0,CT,0,de_dust2,1
1,CT,0,de_dust2,1
2,CT,0,de_dust2,1
3,CT,0,de_dust2,1
4,CT,0,de_dust2,1
...,...,...,...,...
122405,T,1,de_train,6
122406,T,1,de_train,6
122407,T,1,de_train,6
122408,T,1,de_train,6


In [14]:
# Once encoded, the original categorical columns are no longer needed: drop them.
df = df.drop(['round_winner', 'map'], axis=1)

In [15]:
# Target: the encoded round winner.
y = df['round_winner_encoded']
# Features: every remaining column except the target.
x = df.drop(columns=['round_winner_encoded'])

#### O dataset tem valores em escalas muito diferentes; portanto, é necessário padronizar essas escalas para que elas não interfiram no desempenho do modelo

In [16]:
# Instantiate the standardiser, learn the scaling statistics from x and apply them to x.
# NOTE(review): the scaler is fitted on the full feature matrix here, before any
# train/test split, so x_scaled carries statistics from rows that later land in the test set.
scaler = StandardScaler()
scaler.fit(x)

x_scaled = scaler.transform(x)

In [17]:
x_scaled # the transformer returns an array, so it has to be converted back into a DataFrame

array([[ 1.41582771, -1.40057607, -1.40570151, ..., -0.16335636,
        -0.15685461, -1.30640985],
       [ 1.06753073, -1.40057607, -1.40570151, ..., -0.16335636,
        -0.15685461, -1.30640985],
       [-0.03409385, -1.40057607, -1.40570151, ..., -0.16335636,
        -0.15685461, -1.30640985],
       ...,
       [ 0.31291789,  0.89571055,  1.70405824, ..., -0.16335636,
        -0.15685461,  1.28549842],
       [-0.0542903 ,  0.89571055,  1.70405824, ..., -0.16335636,
        -0.15685461,  1.28549842],
       [-0.4214985 ,  0.89571055,  1.70405824, ..., -0.16335636,
        -0.15685461,  1.28549842]])

In [18]:
# Wrap the scaled array in a DataFrame again, reusing the column labels of x,
# then display the result.
x_scaled = pd.DataFrame(data=x_scaled, columns=x.columns)
x_scaled

Unnamed: 0,time_left,ct_score,t_score,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,...,t_grenade_flashbang,ct_grenade_smokegrenade,t_grenade_smokegrenade,ct_grenade_incendiarygrenade,t_grenade_incendiarygrenade,ct_grenade_molotovgrenade,t_grenade_molotovgrenade,ct_grenade_decoygrenade,t_grenade_decoygrenade,map_encoded
0,1.415828,-1.400576,-1.405702,-0.354764,0.664386,0.695301,-1.836777,-1.709542,-0.516186,-0.595345,...,-1.035461,-0.886648,-0.889569,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,-1.306410
1,1.067531,-1.400576,-1.405702,-0.354764,0.664386,0.695301,0.502008,0.008909,-0.819351,-0.870776,...,-1.035461,-0.886648,0.203841,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,-1.306410
2,-0.034094,-1.400576,-1.405702,-0.354764,-0.159544,-0.019401,-0.117770,-0.563908,-0.805977,-0.883109,...,-1.035461,-0.886648,0.203841,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,-1.306410
3,-0.401302,-1.400576,-1.405702,-0.354764,-0.159544,-0.019401,-0.117770,-0.563908,-0.805977,-0.883109,...,-1.035461,-0.886648,-0.889569,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,-1.306410
4,1.415277,-1.191823,-1.405702,-0.354764,0.664386,0.695301,-0.714160,-1.709542,0.763351,-0.040372,...,-1.035461,-0.886648,-0.889569,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,-1.306410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122405,-1.514310,0.895711,1.496741,2.818773,-1.603312,-1.148630,-0.696619,0.346871,-0.863935,-0.435020,...,0.079077,-0.311206,-0.342864,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,1.285498
122406,1.414542,0.895711,1.704058,-0.354764,0.664386,0.695301,-1.281315,-0.707112,0.152561,1.040797,...,0.079077,-0.311206,-0.889569,-0.687185,-0.137694,-0.210881,-0.812929,-0.163356,-0.156855,1.285498
122407,0.312918,0.895711,1.704058,-0.354764,0.664386,0.695301,1.057469,1.011339,-0.765852,-0.373356,...,1.193615,0.839676,1.843956,-0.001350,-0.137694,-0.210881,2.193253,-0.163356,-0.156855,1.285498
122408,-0.054290,0.895711,1.704058,-0.354764,0.664386,0.695301,1.057469,1.011339,-0.765852,-0.373356,...,1.750884,-0.886648,0.750546,-0.687185,-0.137694,-0.210881,1.592017,-0.163356,-0.156855,1.285498


#### separando os dados de treino e teste efetivamente

In [19]:
# Split the data into train and test sets, holding out 30% of the rows for TESTING
# (test_size=0.3, so 70% is used for training); random_state is fixed so the split
# is reproducible.
#
# The split is done on the unscaled features and the scaler is then fitted on the
# training rows only. Fitting the scaler on the whole dataset before splitting lets
# the test rows influence the mean/std used for standardisation (data leakage), which
# makes the evaluation optimistic.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

scaler = StandardScaler().fit(x_train)  # statistics come from the training rows only

# transform() returns a plain array, so rebuild DataFrames keeping labels and row index
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

### implementação do modelo

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
# Create a logistic regression with default settings and train it on the training
# split (fit returns the estimator itself), then predict the class of each test row.
modelo = LogisticRegression().fit(x_train, y_train)
y_pred = modelo.predict(x_test)

In [22]:
# Display the classes predicted by the model for the test rows.
y_pred

array([1, 0, 1, ..., 0, 1, 1])

### Avaliando o modelo

In [23]:
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix # as metricas mais comuns

In [24]:
# Confusion matrix of true vs. predicted labels on the test split (shown in a later cell).
cm = confusion_matrix(y_test, y_pred)

# Report accuracy and F1 as percentages; !s formats the number with str().
print(f'acuracia do modelo foi de: {accuracy_score(y_test, y_pred) * 100!s} %')
print(f'f1 score do modelo foi de: {f1_score(y_test, y_pred) * 100!s} %')

acuracia do modelo foi de: 75.0156577621654 %
f1 score do modelo foi de: 75.19666946013895 %


In [25]:
cm # display the confusion matrix as a raw array (nothing is plotted here)

array([[13640,  4389],
       [ 4786, 13908]], dtype=int64)

### Conclusão

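A partir dos resultados da matriz de confusão e da acurácia e do F1 em torno de 75%, podemos concluir que o modelo está bem balanceado: ele erra de forma parecida nas duas classes (4389 falsos positivos contra 4786 falsos negativos), sem favorecer nenhum dos lados, e apresenta precisão (~76%) e recall (~74%) próximos entre si para a classe T.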