In [4]:
import pandas as pd
from datetime import datetime

## 💾 Load data

In [5]:
df = pd.read_json('../src/battle/infra/train-save.json')
df.to_csv('train-battles.csv', index=False)

df = pd.read_csv("train-battles.csv")

## 🛁 Clean data

In [6]:
selected_channels = ['Red Bull Batalla', 'Urban Roosters', 'GODLEVEL FEST', 'playz', 'Miraelbuenrap']
banned_channels = ['Ryker', 'Rodrigo Quesada', 'Blon Doblefilo']

all_channels = df['channel'].tolist()
new_channel_values = []

for chan in all_channels:
    if chan in selected_channels:
        new_channel_values.append(2)
    elif chan in banned_channels:
        new_channel_values.append(1)
    else:
        new_channel_values.append(0)

df['channel'] = new_channel_values

In [7]:
df = df.drop('title', axis=1)
df = df.drop('image', axis=1)
df = df.drop('rapper', axis=1)
df = df.drop('date', axis=1)

df

Unnamed: 0,channel,is_battle,duration,views
0,2,True,0:09:09,1508907
1,1,False,0:03:09,5615098
2,0,False,0:14:41,1450908
3,1,False,0:01:17,89841
4,2,True,0:14:16,1203206
...,...,...,...,...
1663,1,False,0:41:19,93489
1664,0,False,0:03:01,115520
1665,0,True,0:02:18,79467
1666,0,False,0:07:42,200913


In [8]:
df.loc[df['is_battle'] == True, 'is_battle'] = 1
df.loc[df['is_battle'] == False, 'is_battle'] = 0

df['is_battle']

  df.loc[df['is_battle'] == True, 'is_battle'] = 1


0       1
1       0
2       0
3       0
4       1
       ..
1663    0
1664    0
1665    1
1666    0
1667    0
Name: is_battle, Length: 1668, dtype: object

In [9]:
def convert_duration(duration) -> int:
    duration_datetime = datetime.strptime(duration, "%H:%M:%S")

    if duration_datetime.hour > 0 or duration_datetime.minute > 23:
        return 3 # large
    elif duration_datetime.minute <= 23 and duration_datetime.minute > 10:
        return 2 # medium
    
    elif duration_datetime.minute <= 10 and duration_datetime.minute > 5:
        return 1 # short
    else:
        return 0 # very short

all_durations = df['duration'].tolist()
new_durations = []

for dur in all_durations:
    new_durations.append(convert_duration(dur))

df['duration'] = new_durations


In [10]:
def convert_cout_visits(visits: str) -> int:
    num = int(visits)

    if num > 1000000:
        return 3 # very popular
    elif num <= 1000000 and num > 500000:
        return 2 # medium popular
    elif num <= 500000 and num > 100000:
        return 1 # little popular
    else:
        return 0 # unpopular
    
all_visits = df['views'].tolist()
new_visits = []

for views in all_visits:
    new_visits.append(convert_cout_visits(views))

df['views'] = new_visits

In [11]:
# Save in CSV file
df.to_csv('final-train-table.csv', index=False)

## 🌳 Tree creation

In [34]:
# Object column
y = df['is_battle'].astype('int64').copy()

y

0       1
1       0
2       0
3       0
4       1
       ..
1663    0
1664    0
1665    1
1666    0
1667    0
Name: is_battle, Length: 1668, dtype: int64

In [35]:
x = df[['channel', 'duration', 'views']].copy()

x

Unnamed: 0,channel,duration,views
0,2,1,3
1,1,0,3
2,0,2,3
3,1,0,0
4,2,2,3
...,...,...,...
1663,1,3,0
1664,0,0,1
1665,0,0,0
1666,0,1,1


In [36]:
from sklearn.model_selection import train_test_split

# create training sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=324)

print('X train\n', x_train, '\n')
print('X test\n', x_test, '\n')
print('Y train\n', y_train, '\n')
print('Y test\n', y_test, '\n')

X train
       channel  duration  views
1220        2         2      3
733         0         0      0
1210        0         3      1
1624        1         0      1
1028        0         0      1
...       ...       ...    ...
600         0         2      0
1531        1         2      1
908         0         1      3
756         0         1      3
665         0         1      0

[1117 rows x 3 columns] 

X test
       channel  duration  views
959         0         0      0
651         0         0      0
433         0         0      0
568         2         0      3
1569        0         0      1
...       ...       ...    ...
217         2         2      0
1089        0         1      2
1200        0         0      2
1138        0         3      0
806         0         0      0

[551 rows x 3 columns] 

Y train
 1220    1
733     0
1210    0
1624    0
1028    0
       ..
600     0
1531    0
908     0
756     0
665     0
Name: is_battle, Length: 1117, dtype: int64 

Y test
 959     0
651

In [37]:
from sklearn.tree import DecisionTreeClassifier

# create tree
battle_classifier = DecisionTreeClassifier(max_leaf_nodes=15, random_state=0)

# train
battle_classifier.fit(x_train, y_train)

In [41]:
from sklearn.metrics import accuracy_score

# predictions
predictions = battle_classifier.predict(x_test)

accuracy_score(y_true=y_test, y_pred=predictions)

0.8874773139745916

## 📦 Save decision tree

In [43]:
import joblib

joblib.dump(battle_classifier, 'tree_model.joblib')

['tree_model.joblib']