In [None]:
import spacy
import pandas as pd

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


import wordninja

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Raw competition data; both files are expected in the notebook's working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# spaCy English pipeline (small model).
nlp = spacy.load('en_core_web_sm')

# NLTK English stop-word list, used to filter tokens during preprocessing.
# NOTE: requires the NLTK 'stopwords' corpus to be available locally.
stop_words = stopwords.words('english')

# WordNet-based lemmatizer used to reduce tokens to their dictionary form.
# NOTE: requires the NLTK 'wordnet' corpus to be available locally.
lemmatizer = WordNetLemmatizer()

# Preview the training frame. A notebook cell only renders its *last*
# expression, so the preview has to come after the assignments above.
df_train.head()

In [3]:
def get_tweet_info(tweets):
    """Tokenize, clean and lemmatize a collection of tweets.

    For every tweet the pipeline is:
      1. tokenize with NLTK's ``TweetTokenizer`` (``strip_handles=True``);
      2. drop the punctuation tokens listed below and lowercase the rest;
      3. split tokens that start with ``#`` into words with ``wordninja``;
      4. drop tokens found in the module-level ``stop_words`` list;
      5. lemmatize what is left with the module-level ``lemmatizer``.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A list with one entry per input tweet, each entry being the list of
        lemmatized tokens of that tweet (possibly empty).
    """
    tweet_tokenizer = TweetTokenizer(strip_handles=True)

    # Loop-invariant lookups are built once; sets give O(1) membership tests
    # (the stop-word list was previously scanned linearly for every token).
    punctuation = {'?', '!', ',', '.', ';', '(', ')', ':', '\'', '-'}
    stop_word_set = set(stop_words)

    tokenized_final_tweets = []
    for tweet in tweets:
        # Punctuation is filtered on the raw token, then the token is lowercased.
        # (str.isalpha was not usable as the filter here.)
        tweet_tokens = [
            token.lower()
            for token in tweet_tokenizer.tokenize(tweet)
            if token not in punctuation
        ]

        content_tokens = []
        for token in tweet_tokens:
            # Hashtags are expanded into their component words so that the
            # individual words become features; other tokens are kept whole.
            pieces = wordninja.split(token) if token.startswith('#') else [token]
            content_tokens.extend(
                piece for piece in pieces if piece not in stop_word_set
            )

        tokenized_final_tweets.append(
            [lemmatizer.lemmatize(token) for token in content_tokens]
        )

    return tokenized_final_tweets

In [None]:
'''
Functia get_tweet_info va prelucra datele de input astfel:
- va folosi TweetTokenizer pentru a separa datele deoarece acestea sunt tweet-uri, deci poate fi mai util decat word_tokenize
- va scapa de semnele de punctuatie si va converti toate literele la litere mici
- observam ca exista multe hashtag-uri in majoritatea tweet-urilor, deci va separa hashtag-urile din tweet-uri deoarece
acestea pot contine feature-uri importante ce nu vor fi vazute daca sunt pastrate hashtag-urile intregi. Folosim wordninja 
pentru a separa cuvintele din hashtag-uri
- va elimina stop_word-urile
- va aduce fiecare cuvant la forma sa din dictionar folosind lemmatizer-ul din nltk (am incercat sa folosesc lemmatizer-ul din
spacy pentru ca este mai performant, dar am avut dificultati in folosirea acestuia si am ajuns la cel din nltk)

'''

In [4]:
def get_data(filename, test_size=0.2, random_state=42):
    """Load a labelled tweet CSV and return a preprocessed stratified split.

    The file is read with pandas and must contain a ``tweet`` column (text)
    and a ``label`` column. A single stratified shuffle split is drawn, both
    parts are run through ``get_tweet_info`` and the resulting token lists
    are joined back into space-separated strings (the input format expected
    by ``CountVectorizer``).

    Args:
        filename: path of the CSV file to load.
        test_size: fraction of the rows held out for testing (default 0.2).
        random_state: seed for the shuffle. A fixed default keeps the split,
            and therefore the reported metrics, reproducible between runs;
            pass ``None`` to get a different split on every call.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` values are
        lists of preprocessed tweet strings and the ``y_*`` values are the
        matching label arrays.
    """
    df = pd.read_csv(filename)

    tweets = df['tweet'].to_numpy()
    labels = df['label'].to_numpy()

    # Stratify on the label so both parts keep the original class proportions.
    splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(splitter.split(tweets, labels))

    X_train = [" ".join(tokens) for tokens in get_tweet_info(tweets[train_index])]
    X_test = [" ".join(tokens) for tokens in get_tweet_info(tweets[test_index])]

    return X_train, X_test, labels[train_index], labels[test_index]


In [None]:
'''
In functia get_data obtinem datele de antrenare si de testare. Vom folosi StratifiedShuffleSplit deoarece setul de date nu este 
balansat (avem mai multe date ce nu reprezinta hate speech). Prelucram datele, iar apoi vom concatena tweet-urile inapoi 
folosind join pentru a corespunde cu input-ul necesar de la CountVectorizer
'''

In [11]:
# Preprocessed train / hold-out tweets and their labels from the labelled file.
tweet_train, tweet_test, y_train, y_test = get_data('train.csv')

# Bag-of-words features: the vocabulary is learned on the training tweets
# only and then reused to encode the hold-out tweets.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(tweet_train)
X_test = count_vectorizer.transform(tweet_test)

# Multinomial Naive Bayes with a small smoothing term (alpha).
model = MultinomialNB(alpha=0.01).fit(X_train, y_train)

# Predicted labels for the hold-out tweets.
predictions = model.predict(X_test)

# Overall accuracy followed by the per-class precision / recall / F1 table.
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# Sanity check on one example: the preprocessed text and its class probabilities.
print(tweet_test[0])
print(model.predict_proba(X_test[0]))

0.9562020960425466
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5945
           1       0.70      0.67      0.68       448

    accuracy                           0.96      6393
   macro avg       0.84      0.82      0.83      6393
weighted avg       0.96      0.96      0.96      6393

happy little people ð    happyhappy people kitten cat black kitten  ¦
[[9.99999954e-01 4.61099755e-08]]


In [None]:
'''
Vom folosi CountVectorizer pentru a mapa fiecare token la o pozitie din matricea rezultata. Obtinem matricele sparse pentru train
si test. Antrenam modelul Multinomial Naive Bayes si apoi observam ca avem o acuratete de 96% pe datele de test.

Observatie: Performanta modelului nu scade daca nu despartim hashtag-urile folosind wordninja
'''