In [1]:
import pandas as pd
df = pd.read_pickle("preprocessed.pkl")

In [2]:
df = df[df['clean'] != '']
df = df.reset_index()

In [3]:
# Create tfids scores per sentence
# TFIDF
# TF-IDF (term frequency-inverse document frequency) is a statistical measure 
# that evaluates how relevant a word is to a document in a collection of documents.

# This is done by multiplying two metrics: how many times a word appears in a document,
# and the inverse document frequency of the word across a set of documents.

# So, words that are common in every document, such as this, what, and if, 
# rank low even though they may appear many times, 
# since they don‚Äôt mean much to that document in particular.
# However, if the word Bug appears many times in a document,
# while not appearing many times in others, it probably means that it‚Äôs very relevant. 
# closer it is to 0, the more common a word is. 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(df.clean)
features = pd.Series(tfidf.get_feature_names_out())
transformed = tfidf.transform(df.clean)



In [4]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.clean.split()))

In [5]:
replaced_tfidf_scores = df.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

In [6]:
df_features = pd.DataFrame(data=[df.clean, replaced_tfidf_scores]).T
df_features = df_features.rename(columns={"Unnamed 0": "tfidf"})

In [7]:
# Pad tfidf
from keras_preprocessing.sequence import pad_sequences

sequences =  df_features['tfidf']

# Find the maximum length 
max_len = 0
for vector in sequences:
    if len(vector) > max_len: max_len = len(vector) 

print(max_len)
padded = pad_sequences(sequences, maxlen=max_len)

405


In [8]:
listPadded = []
for i in range(0,len(padded)):
    listPadded.append(list(padded[0]))

In [9]:
df_features['tfidf_padded'] = listPadded

In [24]:
# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# List of sentences
doc = df["clean"]

# Tokenization of each document
tokenized_doc = []
for d in doc:
    tokenized_doc.append(word_tokenize(d.lower()))

tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]

## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size = 100, window = 2, min_count = 1, epochs = 100)

In [12]:
# Load Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
model = Doc2Vec.load("d2v_model.mod")

In [14]:
# Add Doc2Vec in df_features
import numpy as np
from nltk.tokenize import word_tokenize

vectors = model.wv
df_features['vectors'] = df_features.clean.apply(lambda x: model.infer_vector(word_tokenize(x)))

In [15]:
# Add vader columns
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound, neg, pos = [], [], []
for text in df['clean']:
    compound.append(sia.polarity_scores(text)['compound'])
    neg.append(sia.polarity_scores(text)['neg'])
    pos.append(sia.polarity_scores(text)['pos'])
df_features['compound'] = compound
df_features['neg'] = neg
df_features['pos'] = pos

In [16]:
# Add polarity and subjectivity
from textblob import TextBlob

polarity, subjectivity = [], []
for text in df['clean']:
    polarity.append(TextBlob(text).sentiment.polarity)
    subjectivity.append(TextBlob(text).sentiment.subjectivity)
df_features['polarity'] = polarity
df_features['subjectivity'] = subjectivity

In [17]:
# Add each in one vector
features = []
for index, row in df_features.iterrows():
    featuresRow = []
    for column in df_features.columns:
        if column == 'clean' or column == 'tfidf': continue
        if column == 'vectors' or column == 'tfidf_padded': 
            for i in list(row[column]):
                featuresRow.append(i)
            continue
        featuresRow.append(row[column])
    features.append(featuresRow)

In [18]:
len(features[0])

510

In [19]:
# CLUSTERING
from sklearn.cluster import KMeans

# n_init equals the lenth of the word2vec vectors
kmeans = KMeans(n_clusters=2, max_iter=10000, random_state=True, n_init=len(features[0])).fit(X=features)
kmeans.labels_

array([0, 0, 0, ..., 1, 0, 0])

In [31]:
scores = list(kmeans.labels_)

In [36]:
negatives = []
for i in range(0,len(scores)):
    if scores[i]==1: negatives.append(i)

In [38]:
with open('kmeans_negatives.txt','w') as tfile:
    tfile.write(str(negatives))

In [24]:
import pickle
# Save model
with open("kmeans.pkl", "wb") as f:
    pickle.dump(kmeans, f)

In [2]:
import pickle
# load model
with open("kmeans.pkl", "rb") as f:
    kmeans = pickle.load(f)

### Investigate Kmeans results

In [21]:
import pandas as pd
df = pd.read_pickle("preprocessed.pkl")
df = df[df['clean'] != '']
df = df.reset_index()
df['prediction'] = list(kmeans.labels_)

In [35]:
len(df[df['prediction']==1]['text']) # out of 308 371 negative are 28919 ,(76 426)

28919

In [23]:
list(df[df['prediction']==1]['text'])

['RT @__TheCatalyst: Scientists of NASA went to JP Morgan Chase Bank which is known to fund the most fossil fuel projects in the world to tel‚Ä¶',
 'RT @JunkScience: NYTimes @PaulKrugman:\n\n"Such heat waves have happened before, but climate change has made them increasingly common."\n\nReal‚Ä¶',
 "@cryptoworld202 You're missing out on $ZRO! \n\nLet's work together to reduce the effects of global warming! With the help of @carb0nfi!\n\nHelp discover a solution to global warming by using your crypto assets! By earning $ZRO tokens, you're helping the environment at the same time!üíö https://t.co/fkR0RKyQcG",
 'RT @markets: The wonder metal driving the electric vehicle market is in a supply crisis that could hurt efforts to stop global warming (via‚Ä¶',
 'markets: The wonder metal driving the electric vehicle market is in a supply crisis that could hurt efforts to stop global warming (via @climate) https://t.co/ubem6ibVxh',
 'RT @BradPorcellato: #NoosaHeads #Georgia #California, Climate 