In [6]:
import pandas as pd
import numpy as np
import csv, nltk

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import confusion_matrix, classification_report

from nltk.corpus import stopwords

In [7]:
TABLE_PATH = "../../../datasets/twitter-filtered.csv"

In [16]:
twitter = pd.read_csv(TABLE_PATH, quoting=csv.QUOTE_ALL)[["user_id", "tweet_content"]]

authors_names = list(twitter["user_id"].unique())

author_name_to_idx = dict(zip(authors_names, range(0, len(authors_names))))

twitter.user_id = twitter.user_id.apply(lambda x: author_name_to_idx[x])



In [18]:
# def count_instances_per_author(df):
# twitter['counts'] = np.zeros(len(twitter))
# authors = twitter.user_id.unique()
# counts = twitter.groupby(["user_id"]).count().tweet_content

# for i in range(len(counts)):
# i = 0

twitter['counts'] = twitter.groupby(['user_id'])['tweet_content'].transform('count')



In [19]:
twitter

Unnamed: 0,user_id,tweet_content,counts
0,0,@braverbeast I don't think you're an annoying ...,2140
1,0,"@cgpgrey And with that, the ""grey vs gray"" dis...",2140
2,0,Pls turn off the lights I wanna go to bed http...,2140
3,0,@keltbh @ipkipi stylenon,2140
4,0,"Just a terrific mobile app you've got there, @...",2140
...,...,...,...
972019,709,honestly one of the best kpop albums i've ever...,556
972020,709,probably my most played album of 2018 oof,556
972021,709,only M.I.A stans can see this tweet,556
972022,709,but have you watched no mercy https://t.co/euw...,556


In [8]:
twitter.to_csv(path_or_buf="../../../datasets/twitter_interest.csv", index=False, quoting=csv.QUOTE_ALL)

In [26]:
unique_names = twitter.user_id.unique()
df = twitter

botton = 3000
top =    1_000_000

for name in unique_names:
    qtd = len(df[df.user_id == name])
    if qtd > top or qtd < botton:
        df = df[df.user_id != name]

print(f"Quantidade de autores: {len(df.user_id.unique())}")

Quantidade de autores: 51


In [27]:
df

Unnamed: 0,user_id,tweet_id,tweet_lang,is_RT,tweet_content
2140,1009089331,1069949786827177984,en,False,WWE NEEDS TO CHANGE FAST @SkyNewsB...
2141,1009089331,1069950001000996866,en,False,PRIVACY IS VERY IMPORTANT SO WE DON'T GE...
2142,1009089331,1069950714837901313,en,False,BYE NOW @SkyNewsBreak
2143,1009089331,1069953972444762112,en,False,BREXIT MY BRAIN IS SCREAMING NO MORE PL...
2144,1009089331,1069954341648384000,en,False,WHAT DO WANT TO ANTHONY WHAT'S BRAIN...
...,...,...,...,...,...
965454,975363805,1085519137521258496,en,False,Human muscles are limited by our brain – we ac...
965455,975363805,1085519141652611072,en,False,Lack of sleep can cause weight gain of 2 pound...
965456,975363805,1085522935371218944,en,False,"Bad breath is sometimes called Halitosis, comi..."
965457,975363805,1085526645975539712,en,False,"The inability to pronounce the letter ""r"" is c..."


In [13]:
def pipeline(df, 
             test_size: int = 0.2,
             x:str = "message",
             y:str = "author",
             word_re: str = r'\b\w+\b',
             vectorizer = TfidfVectorizer,
             selector = SelectKBest,
             K:int = 1000,
             ngram_range: tuple = (2,2),
             classifier = MultinomialNB):
    
    
    train, test = train_test_split(df, test_size=test_size)
    print(f"Tamanho do treino: {len(train)}\nTamanho do teste: {len(test)}")
    
    print("Extração de características")
    train_X = train[x]
    train_Y = train[y]
    test_X  = test[x]
    test_Y  = test[y]
    
    
    vectorizer_f = vectorizer(ngram_range=ngram_range, token_pattern=word_re).fit(train_X)
    
    train_X = vectorizer_f.transform(train_X)
    test_X = vectorizer_f.transform(test_X)  
    print("Shape: ", train_X.shape, test_X.shape)
    
    print("Seleção de características")
    selector_f = selector(chi2, k=K).fit(train_X, train_Y)
    train_X = selector_f.transform(train_X)
    test_X  = selector_f.transform(test_X)
    
    print("Classificação")
    clf = classifier()
    clf.fit(train_X, train_Y)
    pred = clf.predict(test_X)
    print(f"score {clf.score(test_X, test_Y)}")
    
    
    return pred, test_Y

In [28]:
pred, test_Y = pipeline(df, x="tweet_content", y="user_id", K=16000)

Tamanho do treino: 127870
Tamanho do teste: 31968
Extração de características
Shape:  (127870, 462559) (31968, 462559)
Seleção de características
Classificação
score 0.757444944944945


In [29]:
print(classification_report(pred, test_Y))

                    precision    recall  f1-score   support

          22554465       0.28      0.64      0.39       263
          24022029       0.90      0.94      0.92       637
          24363761       0.94      0.57      0.71      1078
          36513758       0.76      0.97      0.85       507
          45973908       0.99      0.96      0.98       629
          49218895       0.90      0.87      0.88       683
          63288061       0.83      0.81      0.82       602
          64121310       0.98      0.85      0.91       706
          83284284       0.78      0.98      0.87       476
         103244023       0.59      0.86      0.70       411
         121585667       1.00      0.71      0.83       869
         141720091       1.00      0.92      0.96       678
         177040478       0.73      0.91      0.81       473
         356664315       0.50      0.79      0.61       386
         391127889       0.83      0.90      0.87       603
         436792718       0.95      0.93