In [1]:
import csv
import os
import pickle
import re
import time
from collections import Counter
from io import StringIO

import nltk
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# need to use once to download nltk (natural language processing library) on your computer.
# nltk.download()

Vu qu'on a une bibliothèque qui nous permet de faire pas mal de choses, on va :
- mettre tout en minuscules
- retirer la ponctuation
- retirer tous les nombres et caractères non alphanumériques
- tokeniser : donc séparer les mots
- retirer les stop words (and, the, etc.)
- stemmer -> ne garder que la racine de chaque mot

à partir de là on aura déjà un dataset plus ou moins propre =)

In [31]:
# This function does the job of extracting tweet data and formatting it to suit our needs
def format_tweets(filename, data_dir="twitter-datasets", stop_words=None):
    """Load a tweet file and turn every tweet into a list of cleaned tokens.

    Cleaning steps, in order: lowercase, drop ``<user>`` / ``<url>`` tags, drop
    punctuation and digits, squeeze letters repeated three or more times down to
    one, drop one-character tokens, tokenize on whitespace, and finally drop the
    stop words and the retweet marker "rt".

    Parameters
    ----------
    filename : str
        Name of the file inside ``data_dir``; one tweet per line, UTF-8.
    data_dir : str, optional
        Directory holding the file (defaults to "twitter-datasets").
    stop_words : iterable of str, optional
        Words to discard. When omitted, NLTK's English stop word list is used.

    Returns
    -------
    pandas.DataFrame
        One row per tweet; column ``0`` holds the list of remaining tokens.
    """
    # Read the data file
    with open(os.path.join(data_dir, filename), "r", encoding="utf8") as myfile:
        data = myfile.readlines()

    # Make a dataframe out of the data (column 0 exists even for an empty file)
    tweets = pd.DataFrame(data, columns=[0])

    # These are the words we want to ignore; a set gives O(1) lookups and lets
    # "rt" be matched as a whole word instead of by substring test
    if stop_words is None:
        stop_words = nltk.corpus.stopwords.words('english')
    ignored_words = set(stop_words) | {'rt'}

    # The replacement instructions for below (all are regular expressions).
    # No separate apostrophe rule is needed: the punctuation rule already
    # removes apostrophes.
    replacements = [
        ("<user>", ''),                    # usertags
        ("<url>", ''),                     # urls
        (r'[^\w\s]', ''),                  # punctuation / non-alphanumerical characters
        (r'[\d]', ''),                     # numbers
        (r'([a-zA-Z])\1{2,}', r'\1'),      # letters repeated 3+ times -> one letter
        # One-character tokens. Lookarounds do not consume the surrounding
        # whitespace, so consecutive single letters ("a b c") are all removed.
        (r'(?<!\S)\w(?!\S)', ' ')
    ]

    # Put everything in lowercase
    text = tweets[0].astype(str).str.lower()

    # regex=True is explicit because the pandas default changed to literal
    # matching in pandas 2.0
    for pattern, replacement in replacements:
        text = text.str.replace(pattern, replacement, regex=True)

    # Tokenize each tweet
    tokens = text.str.split()

    # Remove the stop words and the "rt"s (retweets)
    tweets[0] = tokens.apply(
        lambda tweet: [word for word in tweet if word not in ignored_words]
    )

    return tweets

In [32]:
# Clean the two training corpora and the test corpus in one pass
# (files are processed in the order listed)
negative_DF, positive_DF, test_DF = (
    format_tweets(source_file)
    for source_file in ("train_neg.txt", "train_pos.txt", "test_data.txt")
)

In [27]:
# Preview the first 10 rows of the cleaned negative tweets
negative_DF.head(10)

Unnamed: 0,0
0,"[vinco, tresorpack, difficulty, object, disass..."
1,"[glad, dot, taks, tomorrow, thankful, startho]"
2,"[vs, celtics, regular, season, fucked, play, p..."
3,"[could, actually, kill, girl, im, sorry]"
4,"[find, hard, believe, im, afraid]"
5,"[wish, could, night, tonight]"
6,"[got, kicked, wgm]"
7,"[yes, tell, lips, closed, okay]"
8,[perfect]
9,"[hi, harry, havea, good, time, aus, didnt, get..."


J'ai remarqué que le stemming n'est pas toujours la meilleure chose à faire et je tente du coup aussi de lemmatiser. Ça nous rajoute de ce fait deux colonnes, pour qu'on puisse ensuite sélectionner la meilleure à utiliser.

In [None]:
# This function applies stemming and lemmatizing to the given tweet set
def stem_and_lem(tweets, stemmer, lemmer):
    """Add stemmed / lemmatized variants of the token lists in column ``0``.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame whose column ``0`` holds one list of tokens per tweet. It is
        modified in place (three columns are added) and also returned.
    stemmer : object
        Anything exposing ``stem(word) -> str``.
    lemmer : object
        Anything exposing ``lemmatize(word) -> str``.

    Returns
    -------
    pandas.DataFrame
        The same frame with the columns ``'stemmed'``, ``'lemmed'`` and
        ``'both'`` (lemmatized first, then stemmed).
    """
    tweets['stemmed'] = tweets[0].apply(lambda tweet: [stemmer.stem(word) for word in tweet])
    # Use the lemmatizer that was passed in, not a notebook-level global
    tweets['lemmed'] = tweets[0].apply(lambda tweet: [lemmer.lemmatize(word) for word in tweet])
    tweets['both'] = tweets['lemmed'].apply(lambda tweet: [stemmer.stem(word) for word in tweet])

    return tweets

In [None]:
# Build the English Snowball stemmer and the WordNet lemmatizer used below
stemmer = nltk.stem.snowball.SnowballStemmer(language='english')
lemmatizer = nltk.stem.WordNetLemmatizer()

Le lemmatizer ne semble pas marcher à 100 %, donc essayons de faire les deux (donc appliquer le stemmer sur la sortie du lemmatizer).

In [None]:
# Add the 'stemmed', 'lemmed' and 'both' columns to each of the three corpora
negative_DF, positive_DF, test_DF = [
    stem_and_lem(corpus, stemmer, lemmatizer)
    for corpus in (negative_DF, positive_DF, test_DF)
]

In [None]:
# Preview the first 10 rows of negative_DF after stem_and_lem was applied
negative_DF.head(10)

In [None]:
# This function saves the processed tweets to a txt file
def save_tweets(tweets, filename, data_dir="twitter-datasets"):
    """Write the 'both' column of ``tweets`` to a text file, one tweet per line.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``'both'`` column holding one list of tokens per tweet.
    filename : str
        Name of the output file inside ``data_dir``.
    data_dir : str, optional
        Output directory (defaults to "twitter-datasets").

    Notes
    -----
    Lines are written directly rather than through ``to_csv``: no header row
    is emitted and a tweet with no tokens left becomes an empty line, so line
    ``i`` of the file always corresponds to tweet ``i``.
    """
    # Put the stemmed and lemmetized tweets back to string form
    lines = [' '.join(tokens) for tokens in tweets['both']]

    # Save to file
    with open(os.path.join(data_dir, filename), "w", encoding="utf8") as myfile:
        myfile.writelines(line + "\n" for line in lines)

In [None]:
# Persist the processed version of each corpus next to its raw file
save_tweets(tweets=negative_DF, filename="train_neg_proc.txt")
save_tweets(tweets=positive_DF, filename="train_pos_proc.txt")
save_tweets(tweets=test_DF, filename="test_data_proc.txt")

Maintenant, il va falloir faire ce qui est proposé dans le pdf, donc par exemple compter les mots qui apparaissent le plus dans les tweets négatifs et positifs, et pourquoi pas utiliser ça ensuite dans notre algorithme pour décider si un tweet du test est positif ou négatif. =)

Quelques petites notes: 
- chaque tweet est une liste de mots
- on a quatre colonnes: les mots de base du tweet, les mots mais stemmés (racine du mot), les mots mais lemmés (idem mais se veut plus précis) et un qui fait les deux (lem puis stem)
- je pars du principe que tout est en anglais.

In [None]:
# Get embeddings and index values
emb = np.load("outputs/embeddings.npy")

# keep_default_na=False: by default read_csv turns tokens such as "nan" or
# "null" into NaN, which would corrupt the word index built below
vocab_cut = pd.read_csv(
    "outputs/vocab_cut.txt",
    sep=" ",
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# The first column holds the word; row i labels row i of emb
index = pd.Series(vocab_cut[vocab_cut.columns[0]].values)

# Create word definition matrix (one row per word, one column per embedding dimension)
word_weights = pd.DataFrame(data=emb, index=index)

word_weights.head()

In [None]:
# This function removes words that are not in the vocabulary from the tweets
def clean_tweets(tweets, vocab):
    """Return a copy of ``tweets`` keeping only the words found in ``vocab``.

    Parameters
    ----------
    tweets : pandas.Series
        One list of tokens per tweet. Not modified.
    vocab : pandas.Series or pandas.DataFrame
        Object whose ``.values`` are the known words.

    Returns
    -------
    pandas.Series
        New series, same index, with out-of-vocabulary words removed.
    """
    # Build the lookup set once: testing against vocab.values directly
    # rescans the whole array for every word
    known_words = set(np.asarray(vocab.values).ravel().tolist())
    clean = tweets.apply(lambda tweet: [word for word in tweet if word in known_words])
    return clean

In [None]:
# Keep only the 'both' (lemmatized then stemmed) column of every corpus.
# Note: test_DF is rebound from a frame to a single column here, so this
# cell cannot be run twice in a row.
neg_DF, pos_DF, test_DF = (
    corpus["both"] for corpus in (negative_DF, positive_DF, test_DF)
)

In [None]:
# Preview the first rows of the 'both' column extracted from positive_DF
pos_DF.head()

<h2>Unused</h2>

In [None]:
# We want a sparse matrix that tells us for each tweet (columns) how many times a given
# word (rows) appears ("bags of words" representation)

# MIGHT NEED TO BE REDEFINED WITH neg_words AND pos_words INSTEAD !!!!!!
def bags_of_words(tweets=None, vocab=None):
    """Count vocabulary words per tweet and return the counts as a sparse series.

    Parameters
    ----------
    tweets : iterable of list of str, optional
        One list of tokens per tweet. ``None`` is treated as "no tweets".
    vocab : dict, optional
        Maps each word to its integer row index. When omitted, it is loaded
        from 'outputs/vocab.pkl'.

    Returns
    -------
    pandas.Series
        Sparse series indexed by (word index, tweet position) holding the
        number of occurrences; only non-zero counts are stored.
    """
    # We get the vocabulary
    if vocab is None:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # vocabulary file that you generated yourself.
        with open('outputs/vocab.pkl', 'rb') as f:
            vocab = pickle.load(f)

    if tweets is None:
        tweets = []

    data, rows, cols = [], [], []
    n_tweets = 0

    for col, tweet in enumerate(tweets):
        n_tweets = col + 1

        # We count word occurences in a tweet
        for word, count in Counter(tweet).items():
            row = vocab.get(word)

            # If the word is in the vocabulary we add it in the matrix.
            # Compare against None: index 0 is a valid row and must be kept.
            if row is not None:
                data.append(count)
                cols.append(col)
                rows.append(row)

    # Fix the shape from the vocabulary and the tweet count so it does not
    # depend on which words happened to occur
    n_words = max(vocab.values()) + 1 if vocab else 0
    matrix = sparse.coo_matrix((data, (rows, cols)), shape=(n_words, n_tweets))

    # We convert the scipy.sparse matrix to a sparse pandas Series for ease of use.
    # pd.SparseSeries was removed in pandas 1.0; fall back to it only on old versions.
    try:
        from_coo = pd.Series.sparse.from_coo
    except AttributeError:
        from_coo = pd.SparseSeries.from_coo
    return from_coo(matrix)

In [None]:
# Build the bag-of-words representation of pos_DF
bags = bags_of_words(pos_DF)

In [None]:
# Preview the first entries of the bag-of-words result
bags.head()

<h2>Random Forest Classifier</h2>

In [None]:
# Sums the embeddings of each word in the given tweet
def query_weights(tweet, weights=None):
    """Return the element-wise sum of the embeddings of the words in ``tweet``.

    Parameters
    ----------
    tweet : iterable of str
        Tokens of one tweet. Words missing from the embedding table are
        ignored; a word that occurs several times is counted each time.
    weights : pandas.DataFrame, optional
        Embedding table indexed by word. Defaults to the notebook-level
        ``word_weights``.

    Returns
    -------
    pandas.Series
        One value per embedding dimension (indexed like ``weights.columns``);
        all zeros when no word of the tweet is known.
    """
    if weights is None:
        weights = word_weights

    known_words = [word for word in tweet if word in weights.index]
    if not known_words:
        return pd.Series(0.0, index=weights.columns)

    # A single label-based selection replaces the row-by-row DataFrame.append,
    # which was quadratic and no longer exists in pandas 2.0
    return weights.loc[known_words].sum(axis=0)

In [None]:
# Build tweet embeddings
neg_dims = neg_DF.copy().apply(query_weights)
pos_dims = pos_DF.copy().apply(query_weights)
test_dims = test_DF.copy().apply(query_weights)

In [None]:
# Save the embeddings in pkl files
with open('outputs/neg_dims.pkl', 'wb') as f:
    pickle.dump(neg_dims, f, pickle.HIGHEST_PROTOCOL)
    
with open('outputs/pos_dims.pkl', 'wb') as f:
    pickle.dump(pos_dims, f, pickle.HIGHEST_PROTOCOL)
                
with open('outputs/test_dims.pkl', 'wb') as f:
    pickle.dump(test_dims, f, pickle.HIGHEST_PROTOCOL)            

In [None]:
# Load the embeddings from pkl files
with open('outputs/neg_dims.pkl', 'rb') as f:
    neg_dims = pickle.load(f)
    
with open('outputs/pos_dims.pkl', 'rb') as f:
    pos_dims = pickle.load(f)
    
with open('outputs/test_dims.pkl', 'rb') as f:
    test_dims = pickle.load(f)

In [None]:
# Generate the matrices for fitting: we stack the positive embeddings on top of the
# negative ones and create the matching y vector with 1's and -1's.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
X = pd.concat([pos_dims, neg_dims])
ones = np.ones((pos_dims.shape[0], 1))
# The negative labels are sized from neg_dims so y lines up with X even when
# the two classes do not have the same number of tweets
y = np.append(ones, -np.ones(len(neg_dims)))

In [None]:
# Applies the Random Forest Classifier technique to the data
start = time.time()
# random_state is fixed so the fitted forest, the submission and the
# cross-validation scores are reproducible from one run to the next
clf = RandomForestClassifier(min_samples_leaf=20, random_state=0)
clf.fit(X, y)
end = time.time()
# Prints the fit duration in seconds and the accuracy on the TRAINING data
# (an optimistic figure; see the cross-validation cell for a fairer estimate)
print("Random Forest", end - start, clf.score(X, y))
# Build the submission table: 1-based Id followed by the predicted label
pred = pd.DataFrame(clf.predict(test_dims))
pred.columns = ["Prediction"]
pred.insert(0, "Id", pred.index + 1)

In [None]:
# Preview the first rows of the prediction table (Id, Prediction)
pred.head()

In [None]:
# We save the submission: the row index is left out and float_format="%.0f"
# writes floating-point values without decimals
pred.to_csv("outputs/submission.csv", index=False, float_format="%.0f")

In [None]:
# Estimate generalisation accuracy with 10-fold cross validation
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)
scores