In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
plt.style.use('ggplot')

In [3]:
df = pd.read_excel('raw_data_with_label.xlsx', sheet_name='links', index_col=0)
df = df[df['y'].notnull()]
df.dropna(inplace=True)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
df_limpo = pd.DataFrame(index=df.index)
df_limpo['title']  = df['title']
df_limpo['views'] = df['view_counts'].astype(int)

## 1. Limpeza da data

In [6]:
clean_date = df['video_date'].str.extract(r'(\d+) de (\w+)\. de (\d+)')
clean_date[0] = clean_date[0].map(lambda x: "0"+ x[0] if len(x[0]) == 1 else x)

mapa_meses = {
    'jan': 'Jan',
    'fev': 'Feb',
    'mar': 'Mar',
    'abr': 'Apr',
    'mai': 'May',
    'jun': 'Jun',
    'jul': 'Jul',
    'ago': 'Aug',
    'set': 'Sep',
    'out': 'Oct',
    'nov': 'Nov',
    'dez': 'Dec'
}

clean_date[1] = clean_date[1].map(mapa_meses)
clean_date = clean_date.apply(lambda x: ' '.join(x), axis=1)
clean_date.head()
df_limpo['date'] = pd.to_datetime(clean_date, format='%d %b %Y')

In [7]:
indexs = df_limpo['date'].loc[df_limpo['date'].dt.year < 2018].index.values

df_limpo.drop(index=indexs, axis='rows', inplace=True)
df.drop(index=indexs, axis='rows', inplace=True)

In [8]:
print(df_limpo.shape, df.shape)

(370, 3) (370, 9)


## 2. Features

In [9]:
features = pd.DataFrame(index=df_limpo.index)
y = df['y'].copy()

features['tempo_desde_pub'] = (pd.to_datetime('2021-03-18') - df_limpo['date']) / np.timedelta64(1, 'D')
features['views'] = df_limpo['views']
features['views_por_dia'] = features['views'] / features['tempo_desde_pub']

In [10]:
qtd = len(df_limpo['date'])
meio = qtd // 2 if qtd % 2 == 0 else (qtd // 2) + 1
date = str(pd.Timestamp(df_limpo.sort_values(by='date')['date'].values[meio]).date())

In [11]:
mask_train = df_limpo['date'] < date
mask_val = df_limpo['date'] >= date

X_train, X_val = features[mask_train], features[mask_val]
y_train, y_val = y[mask_train], y[mask_val]
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(185, 3) (185,) (185, 3) (185,)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_train = df_limpo[mask_train]['title']
title_val = df_limpo[mask_val]['title']

title_vec = TfidfVectorizer(min_df=2) # numero de minimo que uma palavra aparece nos dados para poder virar uma coluna
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)

In [13]:
title_bow_train.shape

(185, 158)

In [14]:
title_bow_train

<185x158 sparse matrix of type '<class 'numpy.float64'>'
	with 1065 stored elements in Compressed Sparse Row format>

In [15]:
1 - 4169/(645*478) # 98 % do dados dessa matriz são valores 0 

0.9864778956245337

In [16]:
# hstack - [1 2]  [3 4] -> [1 2 3 4] - 1x4
# vstach - [1 2]  [3 4] -> [1 2] - 2x2
#                          [3 4]

In [17]:
from scipy.sparse import hstack, vstack

X_train_wtitle = hstack([X_train, title_bow_train])
X_val_wtitle = hstack([X_val, title_bow_val])

In [18]:
print(X_train_wtitle.shape, X_val_wtitle.shape)

(185, 161) (185, 161)


In [19]:
mdl = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced', n_jobs=6)
mdl.fit(X_train_wtitle, y_train)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [20]:
p = mdl.predict_proba(X_val_wtitle)[:,1]

In [21]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [22]:
average_precision_score(y_val, p)

0.3317998943372271

In [23]:
roc_auc_score(y_val, p)

0.6730984340044742

## 3. Active Learning

70 exemplos que o modelo tenha dificuldade

30 exemplos aleatoriamente

In [24]:
df_unlabeled = pd.read_excel('raw_data_with_label.xlsx', sheet_name='links', index_col=0)

columns = list(df_unlabeled.columns)
columns.remove('y')

df_unlabeled = df_unlabeled[df_unlabeled['y'].isnull()].dropna(subset=columns, how='any')

In [25]:
df_unlabeled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1035 entries, 501 to 1766
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   link         1035 non-null   object 
 1   title        1035 non-null   object 
 2   canal_nome   1035 non-null   object 
 3   canal_link   1035 non-null   object 
 4   view_counts  1035 non-null   float64
 5   video_date   1035 non-null   object 
 6   like         1035 non-null   object 
 7   dislike      1035 non-null   object 
 8   y            0 non-null      float64
dtypes: float64(2), object(7)
memory usage: 80.9+ KB


In [26]:
df_unlabeled.head(1)

Unnamed: 0,link,title,canal_nome,canal_link,view_counts,video_date,like,dislike,y
501,https://www.youtube.com/watch?v=GXWBzcAzjoQ,"Kaggle - Titanic Solution [3/3] - Classifier, ...",Minsuk Heo 허민석,https://www.youtube.com/channel/UCxP77kNgVfiiG...,23703.0,29 de out. de 2017,"451 marcações ""Gostei""","6 marcações ""Não gostei""",


In [27]:
df_limpo_u = pd.DataFrame(index=df_unlabeled.index)
df_limpo_u['title'] = df_unlabeled['title']
df_limpo_u['views'] = df_unlabeled['view_counts'].astype(int)

In [28]:
clean_date = df_unlabeled['video_date'].str.extract(r'(\d+) de (\w+)\. de (\d+)')
clean_date[0] = clean_date[0].map(lambda x: "0"+ x[0] if len(x[0]) == 1 else x)

mapa_meses = {
    'jan': 'Jan',
    'fev': 'Feb',
    'mar': 'Mar',
    'abr': 'Apr',
    'mai': 'May',
    'jun': 'Jun',
    'jul': 'Jul',
    'ago': 'Aug',
    'set': 'Sep',
    'out': 'Oct',
    'nov': 'Nov',
    'dez': 'Dec'
}

clean_date[1] = clean_date[1].map(mapa_meses)
clean_date = clean_date.apply(lambda x: ' '.join(x), axis=1)
clean_date.head()
df_limpo_u['date'] = pd.to_datetime(clean_date, format='%d %b %Y')

In [29]:
indexs = df_limpo_u['date'].loc[df_limpo_u['date'].dt.year < 2018].index.values

df_limpo_u.drop(index=indexs, axis='rows', inplace=True)
df_unlabeled.drop(index=indexs, axis='rows', inplace=True)

In [30]:
df_limpo_u

Unnamed: 0,title,views,date
503,Data Analytics for Beginners | Google Data Ana...,119917,2021-02-02
505,BCG Gamma: data science and advanced analytics...,4051,2020-04-01
506,Learn Machine Learning and Artificial Intellig...,36457,2020-06-01
507,Data Science Project from Scratch - Part 2 (Da...,42237,2020-04-06
508,Paris Demoday | Data Science batch #359,1719,2020-03-01
...,...,...,...
1759,MIT 6.S091: Introduction to Deep Reinforcement...,157292,2019-01-02
1760,"Πως να Ξεκινήσω με Machine Learning, NerdCast",4818,2020-09-03
1762,Math Needed for Mastering Data Science,36019,2019-12-09
1765,HIN Webcast: The Revolution of Tomorrow - Arti...,527,2021-03-01


In [31]:
df_limpo_u.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 937 entries, 503 to 1766
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   title   937 non-null    object        
 1   views   937 non-null    int32         
 2   date    937 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int32(1), object(1)
memory usage: 25.6+ KB


In [32]:
df_unlabeled.shape

(937, 9)

In [33]:
features_u = pd.DataFrame(index=df_limpo_u.index)

features_u['tempo_desde_pub'] = (pd.to_datetime('2021-03-18') - df_limpo_u['date']) / np.timedelta64(1, 'D')
features_u['views'] = df_limpo_u['views']
features_u['views_por_dia'] = features_u['views'] / features_u['tempo_desde_pub']

In [34]:
features_u.head(1)

Unnamed: 0,tempo_desde_pub,views,views_por_dia
503,44.0,119917,2725.386364


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_u = df_limpo_u['title']
title_bow_u = title_vec.transform(title_u)

In [36]:
title_bow_u

<937x158 sparse matrix of type '<class 'numpy.float64'>'
	with 4502 stored elements in Compressed Sparse Row format>

In [37]:
Xu_wtitle = hstack([features_u, title_bow_u])

In [38]:
Xu_wtitle

<937x161 sparse matrix of type '<class 'numpy.float64'>'
	with 7307 stored elements in COOrdinate format>

In [39]:
pu = mdl.predict_proba(Xu_wtitle)[:,1]

In [40]:
df_unlabeled['p'] = pu

In [42]:
df_unlabeled.head(1)

Unnamed: 0,link,title,canal_nome,canal_link,view_counts,video_date,like,dislike,y,p
503,https://www.youtube.com/watch?v=GbL-42kv5LI,Data Analytics for Beginners | Google Data Ana...,#GrowWithGoogle,https://www.youtube.com/hashtag/growwithgoogle,119917.0,25 de fev. de 2021,"4.280 marcações ""Gostei""","33 marcações ""Não gostei""",,0.151


### Exemplos que ficaram no limiar do modelo com dificuldade de classificação

In [49]:
mask_u = (df_unlabeled['p'] >= 0.45) & (df_unlabeled['p'] <= 0.55)
mask_u.sum()

13

In [48]:
df_unlabeled[mask_u].sort_values('p')

Unnamed: 0,link,title,canal_nome,canal_link,view_counts,video_date,like,dislike,y,p
1598,https://www.youtube.com/watch?v=tXVNS-V39A0,TensorFlow In 10 Minutes | TensorFlow Tutorial...,edureka!,https://www.youtube.com/channel/UCkw4JCwteGrDH...,229057.0,6 de mai. de 2019,"2.860 marcações ""Gostei""","548 marcações ""Não gostei""",,0.45
1138,https://www.youtube.com/watch?v=da9Q3xoHOxw,"Yumeng Ding (Cornell MFE '20) - ""Interpreting ...",Cornell Financial Engineering Manhattan CFEM,https://www.youtube.com/channel/UCc5EkCmdbJSiP...,29.0,12 de mar. de 2021,"1 marcação ""Gostei""","1 marcação ""Não gostei""",,0.455
599,https://www.youtube.com/watch?v=JgvyzIkgxF0,An introduction to Reinforcement Learning,Arxiv Insights,https://www.youtube.com/channel/UCNIkB2IeJ-6Am...,349704.0,2 de abr. de 2018,"10.691 marcações ""Gostei""","140 marcações ""Não gostei""",,0.479
1145,https://www.youtube.com/watch?v=dl_ZsuHSIFE,SVM Kernal- Polynomial And RBF Implementation ...,Krish Naik,https://www.youtube.com/channel/UCNU_lfiiWBdtU...,4953.0,10 de mar. de 2021,"143 marcações ""Gostei""","3 marcações ""Não gostei""",,0.481
595,https://www.youtube.com/watch?v=J_LnPL3Qg70,Machine Learning Tutorial Python - 3: Linear R...,#MachineLearning,https://www.youtube.com/hashtag/machinelearning,199229.0,4 de jul. de 2018,"2.587 marcações ""Gostei""","103 marcações ""Não gostei""",,0.484
704,https://www.youtube.com/watch?v=Ny82iVL6vQ8,Snapshots: Hamilton Lyric Generator Using Tens...,Kaggle,https://www.youtube.com/channel/UCSNeZleDn9c74...,4266.0,6 de out. de 2020,"116 marcações ""Gostei""","6 marcações ""Não gostei""",,0.492
774,https://www.youtube.com/watch?v=Qimv6lJBnX4,Machine Learning 🤖 con Kaggle y LoRa 2.0 😲,#Kaggle,https://www.youtube.com/hashtag/kaggle,306.0,9 de mar. de 2021,"20 marcações ""Gostei""","Sem marcações ""Não gostei""",,0.494
768,https://www.youtube.com/watch?v=QK16mGnLCig,Kaggle competition meetup: M5 Forecasting - Ac...,Learn Data Science,https://www.youtube.com/channel/UCJhW_16uxALr0...,777.0,20 de set. de 2020,"23 marcações ""Gostei""","Sem marcações ""Não gostei""",,0.499
711,https://www.youtube.com/watch?v=OCwZyYH14uw,Linear Regression vs Logistic Regression | Dat...,#linearregressionvslogisticregression,https://www.youtube.com/hashtag/linearregressi...,211110.0,14 de jan. de 2019,"4.276 marcações ""Gostei""","120 marcações ""Não gostei""",,0.501
1315,https://www.youtube.com/watch?v=js6C2mLXEDw,R vs Python | Best Programming Language for Da...,#PythonVsR,https://www.youtube.com/hashtag/pythonvsr,185007.0,20 de set. de 2018,"3.054 marcações ""Gostei""","252 marcações ""Não gostei""",,0.503


### Filtrando os exemplos para novas labels

In [65]:
mask_u = (df_unlabeled['p'] >= 0.36) & (df_unlabeled['p'] <= 1.)
mask_u.sum()

68

In [66]:
df_unlabeled[mask_u].sort_values('p').head(5)

Unnamed: 0,link,title,canal_nome,canal_link,view_counts,video_date,like,dislike,y,p
1057,https://www.youtube.com/watch?v=aJoaVFEOw5U,machine learning why it is imp,TechShiva by Tanya Batra,https://www.youtube.com/channel/UC0qX3NHtD2yjz...,5.0,11 de mar. de 2021,"1 marcação ""Gostei""","Sem marcações ""Não gostei""",,0.36
1107,https://www.youtube.com/watch?v=c8uWUOSGYUI,Semi-Supervised and Unsupervised Learning Appr...,PyData,https://www.youtube.com/channel/UCOjD18EJYcsBo...,1463.0,28 de jan. de 2021,"41 marcações ""Gostei""","3 marcações ""Não gostei""",,0.362
1453,https://www.youtube.com/watch?v=oV3ZY6tJiA0,Neural Networks and Deep Learning: Crash Cours...,#CrashCourse,https://www.youtube.com/hashtag/crashcourse,200924.0,23 de ago. de 2019,"5.070 marcações ""Gostei""","114 marcações ""Não gostei""",,0.363
910,https://www.youtube.com/watch?v=W-0-u6XVbE4,Overfitting And Underfitting Machine Learning ...,#OverfittingAndUnderFittingInMachineLearning,https://www.youtube.com/hashtag/overfittingand...,3814.0,11 de set. de 2020,"77 marcações ""Gostei""","4 marcações ""Não gostei""",,0.364
947,https://www.youtube.com/watch?v=XRJ-rtP2fVE,The Julia SciML Ecosystem: Scientific Machine ...,The Julia Programming Language,https://www.youtube.com/channel/UC9IuUwwE2xdjQ...,1926.0,9 de mar. de 2021,"75 marcações ""Gostei""","Sem marcações ""Não gostei""",,0.364


In [68]:
dificeis = df_unlabeled[mask_u]
aleatorioes = df_unlabeled[~mask_u].sample(31, random_state=0)

In [70]:
pd.concat([dificeis, aleatorioes]).to_excel('active_learning.xlsx', sheet_name='links')