# STEP 5a: Apply Best Selected Model on the Master Test Dataset - Data pre-processing and NLP transformation
> * Processing numercial and categorical features (as per Step 2)
* Processing and transforming text feature Snippet according to NLP methodology defined in Step 3
* Consolidating data and saving dataset for modeling in a csv file
* **Warning**: It requires to download and save the reference file for the Word2vec processing in order to perform tasks in this Notebook
* Link to the dataset source: https://archive.ics.uci.edu/ml/datasets/News+Popularity+in+Multiple+Social+Media+Platforms#

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.fftpack as sp

%matplotlib inline
import matplotlib.pyplot as plt
from adjustText import adjust_text

import re
from sklearn.preprocessing import Normalizer
import os
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import QuantileTransformer
from sklearn.naive_bayes import ComplementNB, MultinomialNB

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
import scikitplot as skplt

# t_NSE dimensionality reduction
from sklearn.manifold import TSNE

# NLP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import gensim
import gensim.parsing.preprocessing as gsp
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

import random
from sklearn import ensemble

from sklearn.model_selection import StratifiedShuffleSplit

import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# Activate Seaborn style
sns.set()

## Load the file

In [2]:
# Importing the file and creating a dataframe
df_master = pd.read_csv(
    "Master_test_final.csv", low_memory=False, skipinitialspace=True
)

In [3]:
# display all columns
pd.set_option("display.max_columns", None)

In [4]:
# remove the Unnamed column
df_master.drop("Unnamed: 0", axis=1, inplace=True)
df_master.shape

(20000, 26)

In [5]:
df_master.head()

Unnamed: 0,Post_ID,Date,Month,Company,Snippet,TW_Hashtags,Type_Channel,Country,Country 2,Sentiment,ALL_Author,ALL_Impact,ALL_Backlinks,ALL_Thread_Entry_Type,ALL_Tot_Monthly_Vis,ALL_MozRank,TW_Account_Type,TW_Account_Name,TW_NbImpressions,TW_KredInfluence,TW_KredOutreach,TWFB_NbReach,TW_NbFollowers,TW_NbFollowing,TW_Account_Retweet_of,TW_NbTweets
0,883320256,2017-04-28,April,Thomson Reuters,legal education affect market tmsnrt,No TW hashtag,twitter,United Kingdom,United Kingdom,neutral,TRLegalUKI,63,49850734,post,6000000000.0,9.6,organisational,tw account not identified,12204.0,621.0,0.0,1348,1530.0,825.0,not a rt,4124.0
1,747534914,2017-01-05,January,Thomson Reuters,slovenia toughens prevent another influx migra...,No TW hashtag,twitter,Argentina,Argentina,neutral,INFOS_EN,53,49850734,post,6000000000.0,9.6,organisational,infos_en,5772.0,798.0,7.0,798,5772.0,81.0,not a rt,387162.0
2,210103078,2017-04-02,April,Thomson Reuters,zambia power short zambia launches switch perc...,No TW hashtag,twitter,United States,United States,neutral,BrlttanyDuguay,28,49850734,share,6000000000.0,9.6,Not identified,tw account not identified,44.0,410.0,0.0,410,44.0,2239.0,anthoniaorji,563.0
3,908485890,2017-01-26,January,Thomson Reuters,congrats deserved award winner gwgia twitter a...,gwgia,twitter,United States,United States,positive,WomensLeadrshp,28,49850734,post,6000000000.0,9.6,organisational,tw account not identified,50.0,417.0,0.0,417,50.0,63.0,not a rt,190.0
4,701086584,2017-05-22,May,Thomson Reuters,dozen attorneys general defend subsidy payment...,"insurance, obamacare",twitter,United States,United States,neutral,StephOliverTR5,21,49850734,post,6000000000.0,9.6,individual,tw account not identified,10.0,325.0,0.0,325,10.0,81.0,not a rt,209.0


## Cleaning the dataframe based on Step 2 tasks (numerical and categorical features)

In [6]:
# remove the following columns: ALL_Backlinks, ALL_MozRank, ALL_Tot_Monthly_Vis, TW_KredInfluence, TW_NbImpressions, Type_Channel, Month
df_master.drop(
    [
        "ALL_Backlinks",
        "ALL_MozRank",
        "ALL_Tot_Monthly_Vis",
        "TW_KredInfluence",
        "TW_NbImpressions",
        "Type_Channel",
        "Month",
    ],
    axis=1,
    inplace=True,
)
df_master.shape

(20000, 19)

### Numerical features

In [7]:
# select the integer columns
df_num = df_master.select_dtypes(include=[np.int64, np.float64])
# df_num.drop('Post_ID', axis=1, inplace=True)

In [8]:
df_num.shape

(20000, 7)

In [9]:
# Add a Log1p variables to the DataFrame
df_num["Log_TW_KredOutreach"] = np.log1p(df_num["TW_KredOutreach"])
df_num["Log_Nbreach"] = np.log1p(df_num["TWFB_NbReach"])
df_num["Log_TW_NbFollowers"] = np.log1p(df_num["TW_NbFollowers"])
df_num["Log_TW_NbFollowing"] = np.log1p(df_num["TW_NbFollowing"])
df_num["Log_TW_NbTweets"] = np.log1p(df_num["TW_NbTweets"])
df_num.head()

Unnamed: 0,Post_ID,ALL_Impact,TW_KredOutreach,TWFB_NbReach,TW_NbFollowers,TW_NbFollowing,TW_NbTweets,Log_TW_KredOutreach,Log_Nbreach,Log_TW_NbFollowers,Log_TW_NbFollowing,Log_TW_NbTweets
0,883320256,63,0.0,1348,1530.0,825.0,4124.0,0.0,7.207119,7.333676,6.716595,8.324821
1,747534914,53,7.0,798,5772.0,81.0,387162.0,2.079442,6.683361,8.660947,4.406719,12.866601
2,210103078,28,0.0,410,44.0,2239.0,563.0,0.0,6.018593,3.806662,7.714231,6.335054
3,908485890,28,0.0,417,50.0,63.0,190.0,0.0,6.035481,3.931826,4.158883,5.252273
4,701086584,21,0.0,325,10.0,81.0,209.0,0.0,5.786897,2.397895,4.406719,5.347108


In [10]:
# Add a Log1p variables to the DataFrame
df_num["Log_TW_KredOutreach"] = np.log1p(df_num["TW_KredOutreach"])
df_num["Log_Nbreach"] = np.log1p(df_num["TWFB_NbReach"])
df_num["Log_TW_NbFollowers"] = np.log1p(df_num["TW_NbFollowers"])
df_num["Log_TW_NbFollowing"] = np.log1p(df_num["TW_NbFollowing"])
df_num["Log_TW_NbTweets"] = np.log1p(df_num["TW_NbTweets"])
df_num.head()

Unnamed: 0,Post_ID,ALL_Impact,TW_KredOutreach,TWFB_NbReach,TW_NbFollowers,TW_NbFollowing,TW_NbTweets,Log_TW_KredOutreach,Log_Nbreach,Log_TW_NbFollowers,Log_TW_NbFollowing,Log_TW_NbTweets
0,883320256,63,0.0,1348,1530.0,825.0,4124.0,0.0,7.207119,7.333676,6.716595,8.324821
1,747534914,53,7.0,798,5772.0,81.0,387162.0,2.079442,6.683361,8.660947,4.406719,12.866601
2,210103078,28,0.0,410,44.0,2239.0,563.0,0.0,6.018593,3.806662,7.714231,6.335054
3,908485890,28,0.0,417,50.0,63.0,190.0,0.0,6.035481,3.931826,4.158883,5.252273
4,701086584,21,0.0,325,10.0,81.0,209.0,0.0,5.786897,2.397895,4.406719,5.347108


### Categorical features

In [11]:
# Create a dataframe with categorical features
df_cat = df_master[
    [
        "Post_ID",
        "Company",
        "Country 2",
        "Sentiment",
        "ALL_Thread_Entry_Type",
        "TW_Account_Type",
    ]
]

#### One-Hot encoding of following features: Company, Country 2, ALL_Thread_Entry_Type, TW_Account_type

In [12]:
df_cat_nom = df_cat[
    ["Company", "Country 2", "ALL_Thread_Entry_Type", "TW_Account_Type"]
]

In [13]:
encode_norm = pd.get_dummies(df_cat_nom)

In [14]:
encode_norm.shape

(20000, 38)

In [15]:
# Transform dtype from uint8 to int64
encode_norm = encode_norm.astype(np.int64)

#### Encode ordinal feature: sentiment

In [16]:
df_cat = df_cat.replace({"Sentiment": {"negative": -1, "neutral": 0, "positive": 1}})

In [17]:
df_sentiment_encod = df_cat[["Sentiment"]]

### Concatenate different datasets created for feature engineering in one dataset

In [18]:
# Merge encoded nominal and ordinal features
encode_tot = pd.merge(
    df_sentiment_encod, encode_norm, right_index=True, left_index=True
)
encode_tot.shape

(20000, 39)

In [19]:
# Merge encoded features with log_transform features
df_feateng = pd.merge(encode_tot, df_num, right_index=True, left_index=True)

In [20]:
df_feateng.shape

(20000, 51)

In [21]:
# Select features to be added from the master file (twitter_eda)
select_feat = df_master[
    ["Date", "Snippet", "TW_Hashtags", "ALL_Author", "TW_Account_Name"]
]

### Masterdata_raw: Reference dataset with all numerical and categorical features

In [22]:
# Merge dataset with all feautre engineered and the missing features from the original master file
masterdata_raw2 = pd.merge(df_feateng, select_feat, right_index=True, left_index=True)

In [23]:
masterdata_raw2.shape

(20000, 56)

### Masterdata_ml2: Dataframe with selected features eligible for the modeling phase

In [24]:
masterdata_ml2 = masterdata_raw2.drop(
    [
        "TW_KredOutreach",
        "TWFB_NbReach",
        "TW_NbFollowers",
        "TW_NbFollowing",
        "TW_NbTweets",
        "Date",
    ],
    axis=1,
)

In [25]:
masterdata_ml2.shape

(20000, 50)

In [26]:
masterdata_ml2.head()

Unnamed: 0,Sentiment,Company_Clarivate,Company_Informa,Company_Pearson,Company_RELX Group,Company_Thomson Reuters,Company_Wolters Kluwer,Country 2_Argentina,Country 2_Australia,Country 2_Belgium,Country 2_Brazil,Country 2_Canada,Country 2_Ecuador,Country 2_France,Country 2_Germany,Country 2_Hong Kong,Country 2_India,Country 2_Italy,Country 2_Japan,Country 2_Mexico,Country 2_Netherlands,Country 2_Other,Country 2_Philippines,Country 2_Russia,Country 2_Serbia,Country 2_Singapore,Country 2_South Africa,Country 2_Spain,Country 2_Switzerland,Country 2_United Arab Emirates,Country 2_United Kingdom,Country 2_United States,Country 2_Venezuela,ALL_Thread_Entry_Type_post,ALL_Thread_Entry_Type_reply,ALL_Thread_Entry_Type_share,TW_Account_Type_Not identified,TW_Account_Type_individual,TW_Account_Type_organisational,Post_ID,ALL_Impact,Log_TW_KredOutreach,Log_Nbreach,Log_TW_NbFollowers,Log_TW_NbFollowing,Log_TW_NbTweets,Snippet,TW_Hashtags,ALL_Author,TW_Account_Name
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,883320256,63,0.0,7.207119,7.333676,6.716595,8.324821,legal education affect market tmsnrt,No TW hashtag,TRLegalUKI,tw account not identified
1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,747534914,53,2.079442,6.683361,8.660947,4.406719,12.866601,slovenia toughens prevent another influx migra...,No TW hashtag,INFOS_EN,infos_en
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,210103078,28,0.0,6.018593,3.806662,7.714231,6.335054,zambia power short zambia launches switch perc...,No TW hashtag,BrlttanyDuguay,tw account not identified
3,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,908485890,28,0.0,6.035481,3.931826,4.158883,5.252273,congrats deserved award winner gwgia twitter a...,gwgia,WomensLeadrshp,tw account not identified
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,701086584,21,0.0,5.786897,2.397895,4.406719,5.347108,dozen attorneys general defend subsidy payment...,"insurance, obamacare",StephOliverTR5,tw account not identified


### Save Masterdata_mlto a csv file

In [27]:
# masterdata_ml.to_csv('masterdata_ml.csv')

## Preparing text feature based on Step 3 procedure

### Create a dataframe with selected text feature used for modeling

In [28]:
snippets = masterdata_ml2["Snippet"].astype(str)

In [29]:
snippets.head()

0                 legal education affect market tmsnrt
1    slovenia toughens prevent another influx migra...
2    zambia power short zambia launches switch perc...
3    congrats deserved award winner gwgia twitter a...
4    dozen attorneys general defend subsidy payment...
Name: Snippet, dtype: object

### Snippet pre-processing

In [30]:
# Create a list of strings, where each string is an article title
snippet_list = [snippet for snippet in snippets]

big_list = " ".join(snippet_list)

# Tokenize the string into words
tokens = word_tokenize(big_list)

# Remove non-alphabetic tokens, such as punctuation
words = [word.lower() for word in tokens if word.isalpha()]

words = [word for word in words if not word in stop_words]

# Print first 10 words
words[:10]

['legal',
 'education',
 'affect',
 'market',
 'tmsnrt',
 'slovenia',
 'toughens',
 'prevent',
 'another',
 'influx']

### Load Word2Vec model trained on the large corpus: GoogleNews (using Gensim library)
* Library documentation: https://radimrehurek.com/gensim/models/keyedvectors.html
* Link to the dataset source: https://archive.ics.uci.edu/ml/datasets/News+Popularity+in+Multiple+Social+Media+Platforms#

In [31]:
# Load Word2Vec model
model = gensim.models.KeyedVectors.load_word2vec_format(
    "C:/Users/fbaff/Documents/Data Science/Dataset/GoogleNews-vectors-negative300.bin",
    binary=True,
)

### Perform snippet embeding via averaging word embeding for each of them

In [32]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in model.vocab]
    return np.mean(model[doc], axis=0)


# Pre-processing to keep each snippet as a doc (not individual words)
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc


# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)


# Filter out documents
def filter_docs(corpus, texts, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes
    a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus) if condition_on_doc(doc)]

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts)

In [33]:
# Preprocess the corpus
corpus = [preprocess(title) for title in snippet_list]

# Remove docs that don't include any words in W2V's vocab
corpus, titles_list = filter_docs(
    corpus, snippet_list, lambda doc: has_vector_representation(model, doc)
)

# Filter out any empty docs
corpus, titles_list = filter_docs(corpus, snippet_list, lambda doc: (len(doc) != 0))

82 docs removed
0 docs removed


In [34]:
# Initialize an array for the size of the corpus
x = []
for doc in corpus:  # append the vector for each document
    x.append(document_vector(model, doc))

X = np.array(x)  # list to array

In [35]:
# Create a dataframe with full document embedding and the dataframe resulting from the EDA process
df_all_vectors = pd.DataFrame(x)

df_all_vectors["Title"] = titles_list

main_all_vectors2 = pd.concat((df_all_vectors, masterdata_ml2), axis=1)

# Get rid of vectors that couldn't be matched with the original dataframe
main_all_vectors2.dropna(axis=0, inplace=True)

main_all_vectors2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,Title,Sentiment,Company_Clarivate,Company_Informa,Company_Pearson,Company_RELX Group,Company_Thomson Reuters,Company_Wolters Kluwer,Country 2_Argentina,Country 2_Australia,Country 2_Belgium,Country 2_Brazil,Country 2_Canada,Country 2_Ecuador,Country 2_France,Country 2_Germany,Country 2_Hong Kong,Country 2_India,Country 2_Italy,Country 2_Japan,Country 2_Mexico,Country 2_Netherlands,Country 2_Other,Country 2_Philippines,Country 2_Russia,Country 2_Serbia,Country 2_Singapore,Country 2_South Africa,Country 2_Spain,Country 2_Switzerland,Country 2_United Arab Emirates,Country 2_United Kingdom,Country 2_United States,Country 2_Venezuela,ALL_Thread_Entry_Type_post,ALL_Thread_Entry_Type_reply,ALL_Thread_Entry_Type_share,TW_Account_Type_Not identified,TW_Account_Type_individual,TW_Account_Type_organisational,Post_ID,ALL_Impact,Log_TW_KredOutreach,Log_Nbreach,Log_TW_NbFollowers,Log_TW_NbFollowing,Log_TW_NbTweets,Snippet,TW_Hashtags,ALL_Author,TW_Account_Name
0,-0.138428,-0.001816,-0.094147,-0.06778,0.02597,0.033813,0.153992,0.030151,0.143555,-0.002014,-0.060566,-0.019653,-0.105133,0.110291,-0.145081,0.074402,0.076069,0.164062,0.085945,-0.117432,-0.030182,0.036865,0.077515,0.047623,-0.047241,-0.006287,-0.124878,0.029999,-0.014572,0.034546,0.074783,-0.128784,0.033325,-0.019424,-0.014988,-0.050194,0.038574,0.126709,0.039429,-0.017273,-0.025116,0.051147,0.066839,0.020996,-0.204346,-0.10446,0.114651,0.086975,0.012146,0.053604,0.069885,-0.046417,-0.036331,0.003418,0.001617,0.047729,-0.13858,-0.00885,0.042419,-0.139648,-0.076782,-0.106018,0.070251,0.004883,-0.06382,-0.006165,-0.002777,0.13855,-0.02124,-0.050659,-0.080811,-0.001167,0.229248,0.077187,-0.018524,-0.099266,0.133667,0.09082,-0.006561,0.005432,-0.046799,-0.053297,-0.08551,0.131714,-0.066681,0.065979,-0.113937,0.167267,-0.072144,-0.023148,0.364502,-0.130798,0.001923,-0.012207,0.0224,-0.132309,0.091675,-0.052185,0.148911,0.051208,0.023056,0.094238,-0.066589,0.051221,-0.082222,-0.156311,-0.048859,0.025757,0.098572,-0.231934,0.003052,0.028931,-0.167236,0.02356,0.130112,-0.03183,0.011963,-0.057617,0.189087,0.021606,-0.033417,0.081055,-0.078444,-0.006195,-0.038208,-0.072754,-0.16748,-0.033936,-0.050157,0.010376,0.132202,-0.013321,-0.171143,-0.048309,0.03598,-0.074005,0.004211,0.035095,0.057941,0.011475,-0.026855,-0.011841,-0.084412,0.116455,-0.064774,-0.078735,0.200439,0.014816,-0.117486,-0.029541,0.147957,0.024048,-0.075333,-0.042768,-0.109776,-0.041073,-0.059814,-0.21283,-0.122742,-0.10221,0.015442,-0.016171,0.127441,0.037109,0.099136,0.062531,0.070763,-0.082703,0.13578,-0.108429,-0.156799,-0.030884,0.082245,-0.073799,0.073853,-0.093262,0.139313,0.01886,-0.042843,0.139038,-0.19986,-0.13855,-0.013519,-0.049561,0.004272,-0.010231,0.006226,0.125061,0.044098,0.145905,-0.013123,-0.03363,-0.064331,0.030533,-0.118195,0.02383,-0.118866,-0.120972,0.137676,-0.098999,-0.089844,0.010254,0.087921,-0.083313,-0.110107,-0.065796,-0.004395,-0.071716,-0.039185,-0.039413,-0.014481,-0.017151,-0.034546,0.071167,-0.015137,0.06958,0.155396,0.026917,-0.126953,-0.016541,-0.164444,-0.009003,0.036316,0.057556,0.093384,-0.087158,0.12146,-0.057526,-0.021362,0.159332,0.107956,0.012024,0.031433,-0.019339,0.130859,-0.035522,0.098755,-0.062256,0.05307,-0.049805,-0.021805,-0.073044,-0.030945,0.055115,-0.045654,-0.131042,-0.096313,-0.015869,0.067871,-0.109314,0.003296,0.057434,0.036362,0.076416,-0.047974,-0.226807,-0.003746,0.040283,-0.007446,-0.019897,-0.073853,-0.142334,-0.071121,0.067749,0.035118,0.180664,-0.068176,-0.058487,-0.05603,-0.070667,-0.078369,0.092346,0.084152,-0.073486,-0.049774,-0.085266,0.000381,-0.095215,-0.049133,-0.042297,-0.014374,0.011078,0.079987,0.151772,-0.042557,0.021179,-0.068787,0.054382,0.088135,0.032166,-0.003906,0.018005,-0.00589,-0.069153,0.023254,0.140564,-0.025146,-0.038391,0.123535,0.072601,legal education affect market tmsnrt,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,883320256,63,0.0,7.207119,7.333676,6.716595,8.324821,legal education affect market tmsnrt,No TW hashtag,TRLegalUKI,tw account not identified
1,0.01223,0.079514,-0.024948,-0.031738,-0.101898,-0.053185,-0.04071,-0.025894,-0.07827,0.042047,0.035556,-0.170837,-0.034592,0.095093,-0.148304,0.000458,-0.001228,0.114738,-0.07196,-0.103394,0.087502,0.035852,0.094818,-0.01947,0.055298,-0.126808,0.026001,0.11441,0.000872,0.006126,0.010757,-0.073456,-0.065178,-0.127716,-0.040588,-0.105072,-0.075645,0.09375,0.04781,0.11412,0.007721,-0.163528,0.166695,0.061432,0.020233,-0.179771,-0.082367,0.088745,-0.02771,-0.002403,0.022614,0.084351,-0.013672,-0.076481,-0.00956,-0.091282,-0.114663,0.004074,-0.034836,-0.152885,-0.063278,-0.075369,-0.088135,-0.075325,0.026508,-0.046631,-0.075539,-0.033905,-0.035309,0.126303,-0.004669,0.010712,0.093079,0.099464,-0.136566,-0.00169,0.057037,0.110565,0.022949,0.093224,0.014557,-0.015671,-0.064545,-0.046052,-0.018133,0.042114,-0.068008,0.077797,0.02658,-0.032639,0.087334,0.035141,-0.009071,-0.062271,0.06189,-0.038643,0.065968,-0.002701,0.205261,-0.013199,-0.082367,-0.022415,-0.036636,0.044029,-0.047933,-0.026909,-0.030411,-0.025757,0.077404,-0.021263,-0.041473,-0.01433,0.044418,-0.053146,0.006424,0.009476,-0.037888,0.034901,0.00589,0.135803,-0.023117,0.032211,-0.088371,0.010429,0.080292,-0.149841,-0.092438,0.08107,0.079178,-0.011711,0.023178,-0.062294,-0.017303,0.01297,-0.068413,-0.047974,0.02672,0.012559,0.048462,0.036625,0.112793,-0.072754,0.084969,0.172897,0.045563,-0.027542,0.009449,-0.00293,-0.024445,-0.090637,0.200836,-0.108948,-0.104984,-0.011444,0.07209,-0.060364,-0.057663,-0.031258,-0.014954,0.00975,0.003052,-0.059662,-0.05825,-0.186882,0.045441,-0.041641,0.026895,-0.153259,0.012329,0.08934,-0.14492,0.077637,-0.080582,0.014709,0.085083,0.033218,0.149462,-0.019337,0.039932,0.018921,-0.103668,-0.12178,0.050858,-0.096069,-0.008972,0.075195,-0.059399,0.08149,0.082214,-0.00016,0.015419,0.003372,-0.041994,0.058716,-0.103247,0.02133,-0.108712,0.001495,-0.05101,-0.089956,0.142014,0.157181,-0.000557,-0.045532,-0.044365,0.046703,0.073212,-0.031349,0.054779,-0.059875,0.01236,0.040222,-0.094055,-0.011932,-0.080734,0.020987,0.093311,0.016418,-0.059048,0.070557,-0.148712,0.011385,0.022629,0.045837,0.007473,-0.148666,0.096436,0.067932,0.033939,0.10321,-0.010032,0.069897,0.020805,0.038635,0.003342,-0.096161,0.056702,-0.067032,0.004288,-0.058086,0.044281,0.044708,0.031986,0.025101,0.00193,0.027084,0.031891,0.091881,-0.02388,-0.144287,0.128983,0.004967,-0.064434,0.138817,0.008114,0.01001,0.104889,-0.081482,-0.075623,0.030701,0.081711,0.040955,0.037048,0.042625,0.041931,0.025215,-0.08667,-0.118126,-0.046005,-0.174194,0.059845,0.058914,0.035051,-0.004566,-0.004944,-0.007263,-0.016983,-0.08506,-0.007736,0.025848,-0.107086,0.055332,-0.070644,0.076895,-0.023941,0.016251,-0.081665,0.080032,0.063677,0.079437,-0.05072,-0.001709,0.004986,0.00108,0.019249,0.150398,-0.107834,-0.02179,0.008972,0.042465,slovenia toughens prevent another influx migra...,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,747534914,53,2.079442,6.683361,8.660947,4.406719,12.866601,slovenia toughens prevent another influx migra...,No TW hashtag,INFOS_EN,infos_en
2,0.005199,0.054732,0.017548,0.064586,-0.098017,-0.118264,0.064337,-0.206643,0.094229,0.056109,-0.056915,0.004732,-0.135487,0.067272,-0.082606,0.039617,0.043904,0.071039,0.001365,-0.011586,-0.024309,0.008415,0.089552,0.036355,0.090931,0.039351,-0.050987,0.10821,0.047569,-0.025285,-0.022014,-0.018999,-0.071872,-0.114158,-0.016979,-0.033458,-0.141668,0.02609,-0.006503,0.085516,0.06139,-0.083953,0.13304,0.02837,0.042542,-0.08237,-0.088745,-0.058993,-0.120167,0.014665,-0.012285,0.026855,0.088379,-0.027003,-0.01818,-0.038859,-0.136703,-0.08394,-0.057939,-0.061701,-0.038108,0.025639,-0.115456,0.030518,0.034313,-0.090836,-0.06553,0.031472,0.044811,0.073503,0.068742,-0.021335,0.0622,-0.026811,-0.110449,-0.011907,0.007124,0.023831,-0.01463,-0.020297,-0.012887,0.037709,0.045921,0.048095,0.030802,0.015647,0.01659,0.053384,0.020153,0.020131,0.045871,-0.038874,-0.065857,-0.04895,-0.051891,-0.051608,-0.032246,-0.0267,0.133028,0.042492,-0.003214,-0.022462,-0.027047,0.013831,-0.022598,-0.033381,-0.08086,0.003583,0.013406,-0.056463,-0.065158,-0.053378,-0.038841,-0.012207,0.060303,0.042247,0.006092,-0.015531,0.02973,0.011846,0.03378,0.041704,-0.077984,0.050165,0.045531,-0.034202,-0.029874,0.011186,0.054466,0.025762,-0.077481,-0.010189,0.076472,-0.019265,0.014876,0.028409,-0.012018,-0.060392,0.01542,0.067194,0.075958,-0.139053,0.040966,0.13757,0.039773,-0.106867,0.007835,-0.072227,0.030451,-0.111073,0.105558,-0.087569,-0.120464,-0.022189,0.035489,-0.036799,-0.059371,0.016624,-0.023623,-0.069258,-0.076452,0.107289,0.000244,0.018849,-0.014859,-0.093703,0.051143,-0.046473,-0.069047,-0.033469,-0.133656,0.003973,-0.058405,-0.031738,-0.074191,-0.087635,0.128007,-0.153895,0.042525,-0.008034,-0.155895,-0.124367,-0.050296,0.038996,0.067771,0.062253,-0.121729,0.04823,0.025213,0.061266,-0.021262,-0.049111,-0.059898,0.003396,0.070118,0.097323,-0.053295,-0.028723,-0.013056,-0.207475,-0.008062,0.039351,0.07529,-0.01738,0.010259,-0.011502,0.031938,-0.059038,-0.028576,0.012898,-0.075184,0.093439,-0.118208,-0.018684,-0.077448,-0.009721,0.083074,-0.024592,-0.034687,0.075512,-0.014105,-0.017776,-0.009144,0.106179,-0.0003,0.0179,0.040194,0.027632,0.044167,-0.026016,-0.081443,0.05876,0.018594,0.005477,0.017145,0.013505,0.017645,-0.045044,0.002663,-0.020186,-0.01712,0.030925,0.030057,-0.096258,0.002247,0.007563,-0.079512,0.086037,-0.024237,-0.051758,0.050193,0.011178,-0.044586,0.099537,0.010309,-0.032204,0.027588,0.048662,-0.049006,0.032602,0.043346,-0.022239,-0.027654,-0.031061,0.02933,0.093323,-0.053911,-0.061324,-0.000392,-0.006836,0.013627,-0.051225,0.030769,-0.009313,0.105014,-0.04339,-0.047606,-0.035955,-0.047813,0.115179,-0.0449,0.04933,0.079543,0.005708,0.033048,-0.111717,0.038752,0.030473,0.015778,-0.030789,-0.038596,0.032915,-0.11018,0.016297,-0.0583,-0.035889,-0.080788,0.029952,0.005726,0.069858,zambia power short zambia launches switch perc...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,210103078,28,0.0,6.018593,3.806662,7.714231,6.335054,zambia power short zambia launches switch perc...,No TW hashtag,BrlttanyDuguay,tw account not identified
3,-0.070496,-0.001551,-0.077881,0.125061,0.202596,-0.041077,0.050537,-0.205292,0.0342,0.017537,-0.118876,-0.142741,0.034424,-0.019897,0.102519,0.219157,0.118245,-0.034302,0.030599,0.027972,0.058065,0.136312,0.157796,-0.135579,0.094767,-0.142334,-0.097015,-0.040532,-0.020223,-0.074219,0.002706,-0.068909,-0.032359,-0.020528,0.20341,-0.074219,0.078552,0.097758,0.057946,0.165771,0.079437,-0.097453,0.174845,0.225281,0.129201,-0.096802,0.050293,-0.118368,-0.06898,-0.002686,-0.118612,0.099609,0.029439,0.228099,0.007874,-0.008179,0.047729,0.048696,-0.000885,-0.215942,-0.009237,0.056887,-0.050537,-0.186665,-0.059977,-0.068342,0.022807,-0.007584,-0.01766,0.22522,0.060791,0.058797,0.108622,0.033666,-0.103312,-0.130834,0.107117,0.112681,0.033986,0.055735,-0.105408,0.074056,0.049052,-0.076579,0.094798,-0.136882,0.01621,0.104187,0.074453,0.072632,0.029154,0.111186,-0.039352,0.014079,-0.058182,0.03125,-0.121724,0.0746,0.169434,-0.064494,-0.124512,0.011365,0.007568,-0.092753,-0.077611,-0.096069,-0.134521,-0.105184,0.115763,0.027995,-0.054891,-0.032959,-0.014099,0.035502,0.124613,-0.087016,-0.045797,0.00204,0.070488,-0.045583,0.038656,-0.009903,-0.160889,-0.077128,0.097265,-0.020915,-0.008403,0.025411,-0.044612,-0.033824,-0.143453,-0.063192,-0.092428,-0.000102,-0.209066,-0.013977,-0.039266,0.187419,-0.043783,0.120728,0.01119,-0.040365,0.111023,-0.286784,-0.128255,0.040365,-0.095551,-0.00562,0.056539,-0.190936,-0.005371,0.132161,-0.045085,0.011475,-0.024038,-0.085653,0.072774,-0.106934,-0.014242,-0.157633,-0.204854,0.016927,0.056641,0.180695,-0.082024,-0.236572,0.002462,-0.069628,-0.237071,0.213216,0.090597,-0.020101,-0.018778,-0.079753,-0.098633,0.053955,0.079926,-0.076508,0.008077,-0.011271,-0.142293,0.007563,0.107005,0.022217,0.006063,-0.016764,-0.098785,0.044454,-0.089193,0.051453,0.098857,0.079976,0.023931,-0.087646,-0.011617,-0.000203,0.058309,0.032552,-0.001856,-0.088257,-0.062032,0.119456,-0.049662,0.079915,0.050784,-0.025208,-0.073283,0.180298,-0.039795,0.034861,0.193156,0.141184,-0.047582,-0.106445,-0.056693,0.075083,0.093058,-0.074443,-0.021119,-0.083577,-0.028463,0.022491,-0.019287,0.055715,0.013306,-0.039307,0.020365,-0.064616,-0.01935,-0.006999,0.007609,-0.042175,-0.102905,-0.034368,0.130412,-0.107117,-0.043416,-0.123088,0.181396,0.131307,0.003225,-0.134766,0.122396,-0.002294,-0.174845,0.117666,0.125977,0.220133,0.028374,-0.198934,-0.02888,0.10498,0.002747,0.107747,0.037516,0.129985,-0.030093,0.038066,-0.219859,-0.089681,0.0177,0.199636,-0.054667,0.010783,0.112122,0.076131,-0.108582,0.050252,-0.087362,-0.080597,-0.143778,-0.045369,-0.000427,-0.103897,0.111145,0.015849,-0.136922,-0.013631,-0.097796,0.202738,0.046794,-0.014954,-0.104451,0.026802,-0.004313,0.150177,-0.03301,0.145004,0.081421,-0.012248,0.168376,0.052175,-0.161316,-0.130046,0.039612,-0.11617,0.040436,-0.164225,-0.055257,-0.015299,congrats deserved award winner gwgia twitter a...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,908485890,28,0.0,6.035481,3.931826,4.158883,5.252273,congrats deserved award winner gwgia twitter a...,gwgia,WomensLeadrshp,tw account not identified
4,0.04185,0.011874,0.00087,0.11636,-0.045201,-0.092346,0.055884,0.009058,0.119873,-0.034528,-0.000253,-0.003101,-0.086755,0.099731,-0.152393,0.134985,0.117554,0.075342,-0.086426,0.078662,0.074268,0.044897,0.115576,0.016534,0.02964,0.140047,-0.169946,0.011022,-0.071838,0.002721,-0.047498,-0.035748,0.051038,-0.038345,0.101862,0.010095,0.1058,0.096767,0.021497,0.070856,0.066577,0.008514,0.10769,-0.092493,-0.113377,-0.139429,0.025978,-0.01214,-0.146069,-0.004535,0.063062,-0.074904,-0.065656,0.058105,-0.072534,0.081445,-0.103766,-0.03848,-0.086,-0.166553,-0.03727,0.016122,-0.132886,-0.019372,0.008618,-0.054723,-0.001251,0.061108,-0.02677,0.100653,-0.107672,0.040771,0.101331,-0.010526,-0.137744,-0.114219,0.009644,0.028064,-0.040039,0.030164,-0.022638,-0.003406,-0.096417,0.106445,-0.003076,-0.051038,-0.107304,0.13042,-0.044934,0.015417,0.110059,-0.019588,-0.053433,-0.128015,0.002332,0.009448,0.023735,-0.088187,0.143088,-0.060966,-0.067175,-0.133057,0.044406,0.004248,0.044672,-0.068866,-0.056653,-0.03335,0.145923,-0.105157,-0.040344,-0.050293,-0.110754,0.058716,0.047339,0.098546,0.063257,-0.056067,0.181519,0.080188,0.000537,0.009302,-0.094617,-0.040088,0.176581,-0.12854,-0.035355,-0.047463,0.162439,-0.067236,-0.085254,-0.093512,-0.087659,-0.162942,-0.140771,-0.027283,-0.001599,-0.030322,0.001086,0.027148,0.065686,-0.104749,0.04259,0.01944,0.012671,-0.078979,-0.047803,0.029321,-0.103125,-0.071252,0.067676,-0.029858,-0.070752,-0.009798,0.083917,-0.068579,0.008618,-0.098071,-0.049768,-0.05849,-0.060168,0.0401,0.018506,-0.010437,-0.021631,-0.065588,-0.044489,-0.09408,0.012891,0.048911,-0.021866,0.043634,0.085156,-0.091162,-0.018781,-0.141791,0.08302,-0.021155,-0.109839,-0.057446,-0.17572,-0.088184,-0.114349,-0.023901,0.024927,-0.006714,-0.031366,0.129227,0.049738,-0.008325,0.035193,0.009375,-0.038269,-0.064307,0.016064,0.11665,-0.125525,-0.032275,0.01615,-0.08205,-0.079651,0.050598,0.045227,0.050821,-0.026855,-0.031464,-0.000159,-0.045581,-0.03999,-0.047827,0.044617,0.047632,-0.012228,0.020285,-0.078027,0.124213,0.008447,0.028583,-0.09104,-0.118811,-0.003333,0.016968,0.016754,0.023718,0.015741,-0.156323,0.073425,0.135022,-0.004651,-0.046069,-0.015863,0.04657,0.050195,0.019781,0.100684,-0.06438,0.138882,-0.066742,0.049937,-0.079776,-0.024597,-0.025159,0.001929,-0.075439,-0.010855,0.055804,0.061731,0.046924,0.102417,-0.116528,0.024963,-0.082275,-0.003894,-0.037988,0.003934,-0.005005,-0.01857,0.055762,-0.017186,0.020416,-0.054956,-0.001379,-0.050317,0.03551,-0.035742,0.145776,-0.068066,0.002206,-0.054669,0.033447,0.033691,0.174512,0.052695,-0.026328,0.026733,0.007837,-0.090417,-0.01153,0.066528,0.036597,-0.041593,-0.072046,0.111548,0.124762,-0.066577,0.050116,-0.045959,0.118604,0.167102,0.018042,0.057135,0.109727,0.022656,-0.044318,0.007727,-0.011847,0.072589,-0.049911,-0.041998,-0.089526,dozen attorneys general defend subsidy payment...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,701086584,21,0.0,5.786897,2.397895,4.406719,5.347108,dozen attorneys general defend subsidy payment...,"insurance, obamacare",StephOliverTR5,tw account not identified


In [36]:
main_all_vectors2.shape

(19848, 351)

#### Transform 'All Impact' variable in 3 bins as defined in the Step 4
> * In order to preserve the integrity of Master Test variables and stay coherent with the modeling approach, we will perform a bin transformation with the same bin width as defined in the Step 4
* Bins range are: [ 0., 30., 41., 80.]
* Remove not relevant features and save the dataframe to a csv file

In [37]:
main_all_vectors2["All_impact_bins"] = pd.cut(
    x=main_all_vectors2["ALL_Impact"], bins=[0, 30, 41, 80], labels=[0, 1, 2]
)

In [38]:
main_all_vectors2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,Title,Sentiment,Company_Clarivate,Company_Informa,Company_Pearson,Company_RELX Group,Company_Thomson Reuters,Company_Wolters Kluwer,Country 2_Argentina,Country 2_Australia,Country 2_Belgium,Country 2_Brazil,Country 2_Canada,Country 2_Ecuador,Country 2_France,Country 2_Germany,Country 2_Hong Kong,Country 2_India,Country 2_Italy,Country 2_Japan,Country 2_Mexico,Country 2_Netherlands,Country 2_Other,Country 2_Philippines,Country 2_Russia,Country 2_Serbia,Country 2_Singapore,Country 2_South Africa,Country 2_Spain,Country 2_Switzerland,Country 2_United Arab Emirates,Country 2_United Kingdom,Country 2_United States,Country 2_Venezuela,ALL_Thread_Entry_Type_post,ALL_Thread_Entry_Type_reply,ALL_Thread_Entry_Type_share,TW_Account_Type_Not identified,TW_Account_Type_individual,TW_Account_Type_organisational,Post_ID,ALL_Impact,Log_TW_KredOutreach,Log_Nbreach,Log_TW_NbFollowers,Log_TW_NbFollowing,Log_TW_NbTweets,Snippet,TW_Hashtags,ALL_Author,TW_Account_Name,All_impact_bins
0,-0.138428,-0.001816,-0.094147,-0.06778,0.02597,0.033813,0.153992,0.030151,0.143555,-0.002014,-0.060566,-0.019653,-0.105133,0.110291,-0.145081,0.074402,0.076069,0.164062,0.085945,-0.117432,-0.030182,0.036865,0.077515,0.047623,-0.047241,-0.006287,-0.124878,0.029999,-0.014572,0.034546,0.074783,-0.128784,0.033325,-0.019424,-0.014988,-0.050194,0.038574,0.126709,0.039429,-0.017273,-0.025116,0.051147,0.066839,0.020996,-0.204346,-0.10446,0.114651,0.086975,0.012146,0.053604,0.069885,-0.046417,-0.036331,0.003418,0.001617,0.047729,-0.13858,-0.00885,0.042419,-0.139648,-0.076782,-0.106018,0.070251,0.004883,-0.06382,-0.006165,-0.002777,0.13855,-0.02124,-0.050659,-0.080811,-0.001167,0.229248,0.077187,-0.018524,-0.099266,0.133667,0.09082,-0.006561,0.005432,-0.046799,-0.053297,-0.08551,0.131714,-0.066681,0.065979,-0.113937,0.167267,-0.072144,-0.023148,0.364502,-0.130798,0.001923,-0.012207,0.0224,-0.132309,0.091675,-0.052185,0.148911,0.051208,0.023056,0.094238,-0.066589,0.051221,-0.082222,-0.156311,-0.048859,0.025757,0.098572,-0.231934,0.003052,0.028931,-0.167236,0.02356,0.130112,-0.03183,0.011963,-0.057617,0.189087,0.021606,-0.033417,0.081055,-0.078444,-0.006195,-0.038208,-0.072754,-0.16748,-0.033936,-0.050157,0.010376,0.132202,-0.013321,-0.171143,-0.048309,0.03598,-0.074005,0.004211,0.035095,0.057941,0.011475,-0.026855,-0.011841,-0.084412,0.116455,-0.064774,-0.078735,0.200439,0.014816,-0.117486,-0.029541,0.147957,0.024048,-0.075333,-0.042768,-0.109776,-0.041073,-0.059814,-0.21283,-0.122742,-0.10221,0.015442,-0.016171,0.127441,0.037109,0.099136,0.062531,0.070763,-0.082703,0.13578,-0.108429,-0.156799,-0.030884,0.082245,-0.073799,0.073853,-0.093262,0.139313,0.01886,-0.042843,0.139038,-0.19986,-0.13855,-0.013519,-0.049561,0.004272,-0.010231,0.006226,0.125061,0.044098,0.145905,-0.013123,-0.03363,-0.064331,0.030533,-0.118195,0.02383,-0.118866,-0.120972,0.137676,-0.098999,-0.089844,0.010254,0.087921,-0.083313,-0.110107,-0.065796,-0.004395,-0.071716,-0.039185,-0.039413,-0.014481,-0.017151,-0.034546,0.071167,-0.015137,0.06958,0.155396,0.026917,-0.126953,-0.016541,-0.164444,-0.009003,0.036316,0.057556,0.093384,-0.087158,0.12146,-0.057526,-0.021362,0.159332,0.107956,0.012024,0.031433,-0.019339,0.130859,-0.035522,0.098755,-0.062256,0.05307,-0.049805,-0.021805,-0.073044,-0.030945,0.055115,-0.045654,-0.131042,-0.096313,-0.015869,0.067871,-0.109314,0.003296,0.057434,0.036362,0.076416,-0.047974,-0.226807,-0.003746,0.040283,-0.007446,-0.019897,-0.073853,-0.142334,-0.071121,0.067749,0.035118,0.180664,-0.068176,-0.058487,-0.05603,-0.070667,-0.078369,0.092346,0.084152,-0.073486,-0.049774,-0.085266,0.000381,-0.095215,-0.049133,-0.042297,-0.014374,0.011078,0.079987,0.151772,-0.042557,0.021179,-0.068787,0.054382,0.088135,0.032166,-0.003906,0.018005,-0.00589,-0.069153,0.023254,0.140564,-0.025146,-0.038391,0.123535,0.072601,legal education affect market tmsnrt,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,883320256,63,0.0,7.207119,7.333676,6.716595,8.324821,legal education affect market tmsnrt,No TW hashtag,TRLegalUKI,tw account not identified,2
1,0.01223,0.079514,-0.024948,-0.031738,-0.101898,-0.053185,-0.04071,-0.025894,-0.07827,0.042047,0.035556,-0.170837,-0.034592,0.095093,-0.148304,0.000458,-0.001228,0.114738,-0.07196,-0.103394,0.087502,0.035852,0.094818,-0.01947,0.055298,-0.126808,0.026001,0.11441,0.000872,0.006126,0.010757,-0.073456,-0.065178,-0.127716,-0.040588,-0.105072,-0.075645,0.09375,0.04781,0.11412,0.007721,-0.163528,0.166695,0.061432,0.020233,-0.179771,-0.082367,0.088745,-0.02771,-0.002403,0.022614,0.084351,-0.013672,-0.076481,-0.00956,-0.091282,-0.114663,0.004074,-0.034836,-0.152885,-0.063278,-0.075369,-0.088135,-0.075325,0.026508,-0.046631,-0.075539,-0.033905,-0.035309,0.126303,-0.004669,0.010712,0.093079,0.099464,-0.136566,-0.00169,0.057037,0.110565,0.022949,0.093224,0.014557,-0.015671,-0.064545,-0.046052,-0.018133,0.042114,-0.068008,0.077797,0.02658,-0.032639,0.087334,0.035141,-0.009071,-0.062271,0.06189,-0.038643,0.065968,-0.002701,0.205261,-0.013199,-0.082367,-0.022415,-0.036636,0.044029,-0.047933,-0.026909,-0.030411,-0.025757,0.077404,-0.021263,-0.041473,-0.01433,0.044418,-0.053146,0.006424,0.009476,-0.037888,0.034901,0.00589,0.135803,-0.023117,0.032211,-0.088371,0.010429,0.080292,-0.149841,-0.092438,0.08107,0.079178,-0.011711,0.023178,-0.062294,-0.017303,0.01297,-0.068413,-0.047974,0.02672,0.012559,0.048462,0.036625,0.112793,-0.072754,0.084969,0.172897,0.045563,-0.027542,0.009449,-0.00293,-0.024445,-0.090637,0.200836,-0.108948,-0.104984,-0.011444,0.07209,-0.060364,-0.057663,-0.031258,-0.014954,0.00975,0.003052,-0.059662,-0.05825,-0.186882,0.045441,-0.041641,0.026895,-0.153259,0.012329,0.08934,-0.14492,0.077637,-0.080582,0.014709,0.085083,0.033218,0.149462,-0.019337,0.039932,0.018921,-0.103668,-0.12178,0.050858,-0.096069,-0.008972,0.075195,-0.059399,0.08149,0.082214,-0.00016,0.015419,0.003372,-0.041994,0.058716,-0.103247,0.02133,-0.108712,0.001495,-0.05101,-0.089956,0.142014,0.157181,-0.000557,-0.045532,-0.044365,0.046703,0.073212,-0.031349,0.054779,-0.059875,0.01236,0.040222,-0.094055,-0.011932,-0.080734,0.020987,0.093311,0.016418,-0.059048,0.070557,-0.148712,0.011385,0.022629,0.045837,0.007473,-0.148666,0.096436,0.067932,0.033939,0.10321,-0.010032,0.069897,0.020805,0.038635,0.003342,-0.096161,0.056702,-0.067032,0.004288,-0.058086,0.044281,0.044708,0.031986,0.025101,0.00193,0.027084,0.031891,0.091881,-0.02388,-0.144287,0.128983,0.004967,-0.064434,0.138817,0.008114,0.01001,0.104889,-0.081482,-0.075623,0.030701,0.081711,0.040955,0.037048,0.042625,0.041931,0.025215,-0.08667,-0.118126,-0.046005,-0.174194,0.059845,0.058914,0.035051,-0.004566,-0.004944,-0.007263,-0.016983,-0.08506,-0.007736,0.025848,-0.107086,0.055332,-0.070644,0.076895,-0.023941,0.016251,-0.081665,0.080032,0.063677,0.079437,-0.05072,-0.001709,0.004986,0.00108,0.019249,0.150398,-0.107834,-0.02179,0.008972,0.042465,slovenia toughens prevent another influx migra...,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,747534914,53,2.079442,6.683361,8.660947,4.406719,12.866601,slovenia toughens prevent another influx migra...,No TW hashtag,INFOS_EN,infos_en,2
2,0.005199,0.054732,0.017548,0.064586,-0.098017,-0.118264,0.064337,-0.206643,0.094229,0.056109,-0.056915,0.004732,-0.135487,0.067272,-0.082606,0.039617,0.043904,0.071039,0.001365,-0.011586,-0.024309,0.008415,0.089552,0.036355,0.090931,0.039351,-0.050987,0.10821,0.047569,-0.025285,-0.022014,-0.018999,-0.071872,-0.114158,-0.016979,-0.033458,-0.141668,0.02609,-0.006503,0.085516,0.06139,-0.083953,0.13304,0.02837,0.042542,-0.08237,-0.088745,-0.058993,-0.120167,0.014665,-0.012285,0.026855,0.088379,-0.027003,-0.01818,-0.038859,-0.136703,-0.08394,-0.057939,-0.061701,-0.038108,0.025639,-0.115456,0.030518,0.034313,-0.090836,-0.06553,0.031472,0.044811,0.073503,0.068742,-0.021335,0.0622,-0.026811,-0.110449,-0.011907,0.007124,0.023831,-0.01463,-0.020297,-0.012887,0.037709,0.045921,0.048095,0.030802,0.015647,0.01659,0.053384,0.020153,0.020131,0.045871,-0.038874,-0.065857,-0.04895,-0.051891,-0.051608,-0.032246,-0.0267,0.133028,0.042492,-0.003214,-0.022462,-0.027047,0.013831,-0.022598,-0.033381,-0.08086,0.003583,0.013406,-0.056463,-0.065158,-0.053378,-0.038841,-0.012207,0.060303,0.042247,0.006092,-0.015531,0.02973,0.011846,0.03378,0.041704,-0.077984,0.050165,0.045531,-0.034202,-0.029874,0.011186,0.054466,0.025762,-0.077481,-0.010189,0.076472,-0.019265,0.014876,0.028409,-0.012018,-0.060392,0.01542,0.067194,0.075958,-0.139053,0.040966,0.13757,0.039773,-0.106867,0.007835,-0.072227,0.030451,-0.111073,0.105558,-0.087569,-0.120464,-0.022189,0.035489,-0.036799,-0.059371,0.016624,-0.023623,-0.069258,-0.076452,0.107289,0.000244,0.018849,-0.014859,-0.093703,0.051143,-0.046473,-0.069047,-0.033469,-0.133656,0.003973,-0.058405,-0.031738,-0.074191,-0.087635,0.128007,-0.153895,0.042525,-0.008034,-0.155895,-0.124367,-0.050296,0.038996,0.067771,0.062253,-0.121729,0.04823,0.025213,0.061266,-0.021262,-0.049111,-0.059898,0.003396,0.070118,0.097323,-0.053295,-0.028723,-0.013056,-0.207475,-0.008062,0.039351,0.07529,-0.01738,0.010259,-0.011502,0.031938,-0.059038,-0.028576,0.012898,-0.075184,0.093439,-0.118208,-0.018684,-0.077448,-0.009721,0.083074,-0.024592,-0.034687,0.075512,-0.014105,-0.017776,-0.009144,0.106179,-0.0003,0.0179,0.040194,0.027632,0.044167,-0.026016,-0.081443,0.05876,0.018594,0.005477,0.017145,0.013505,0.017645,-0.045044,0.002663,-0.020186,-0.01712,0.030925,0.030057,-0.096258,0.002247,0.007563,-0.079512,0.086037,-0.024237,-0.051758,0.050193,0.011178,-0.044586,0.099537,0.010309,-0.032204,0.027588,0.048662,-0.049006,0.032602,0.043346,-0.022239,-0.027654,-0.031061,0.02933,0.093323,-0.053911,-0.061324,-0.000392,-0.006836,0.013627,-0.051225,0.030769,-0.009313,0.105014,-0.04339,-0.047606,-0.035955,-0.047813,0.115179,-0.0449,0.04933,0.079543,0.005708,0.033048,-0.111717,0.038752,0.030473,0.015778,-0.030789,-0.038596,0.032915,-0.11018,0.016297,-0.0583,-0.035889,-0.080788,0.029952,0.005726,0.069858,zambia power short zambia launches switch perc...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,210103078,28,0.0,6.018593,3.806662,7.714231,6.335054,zambia power short zambia launches switch perc...,No TW hashtag,BrlttanyDuguay,tw account not identified,0
3,-0.070496,-0.001551,-0.077881,0.125061,0.202596,-0.041077,0.050537,-0.205292,0.0342,0.017537,-0.118876,-0.142741,0.034424,-0.019897,0.102519,0.219157,0.118245,-0.034302,0.030599,0.027972,0.058065,0.136312,0.157796,-0.135579,0.094767,-0.142334,-0.097015,-0.040532,-0.020223,-0.074219,0.002706,-0.068909,-0.032359,-0.020528,0.20341,-0.074219,0.078552,0.097758,0.057946,0.165771,0.079437,-0.097453,0.174845,0.225281,0.129201,-0.096802,0.050293,-0.118368,-0.06898,-0.002686,-0.118612,0.099609,0.029439,0.228099,0.007874,-0.008179,0.047729,0.048696,-0.000885,-0.215942,-0.009237,0.056887,-0.050537,-0.186665,-0.059977,-0.068342,0.022807,-0.007584,-0.01766,0.22522,0.060791,0.058797,0.108622,0.033666,-0.103312,-0.130834,0.107117,0.112681,0.033986,0.055735,-0.105408,0.074056,0.049052,-0.076579,0.094798,-0.136882,0.01621,0.104187,0.074453,0.072632,0.029154,0.111186,-0.039352,0.014079,-0.058182,0.03125,-0.121724,0.0746,0.169434,-0.064494,-0.124512,0.011365,0.007568,-0.092753,-0.077611,-0.096069,-0.134521,-0.105184,0.115763,0.027995,-0.054891,-0.032959,-0.014099,0.035502,0.124613,-0.087016,-0.045797,0.00204,0.070488,-0.045583,0.038656,-0.009903,-0.160889,-0.077128,0.097265,-0.020915,-0.008403,0.025411,-0.044612,-0.033824,-0.143453,-0.063192,-0.092428,-0.000102,-0.209066,-0.013977,-0.039266,0.187419,-0.043783,0.120728,0.01119,-0.040365,0.111023,-0.286784,-0.128255,0.040365,-0.095551,-0.00562,0.056539,-0.190936,-0.005371,0.132161,-0.045085,0.011475,-0.024038,-0.085653,0.072774,-0.106934,-0.014242,-0.157633,-0.204854,0.016927,0.056641,0.180695,-0.082024,-0.236572,0.002462,-0.069628,-0.237071,0.213216,0.090597,-0.020101,-0.018778,-0.079753,-0.098633,0.053955,0.079926,-0.076508,0.008077,-0.011271,-0.142293,0.007563,0.107005,0.022217,0.006063,-0.016764,-0.098785,0.044454,-0.089193,0.051453,0.098857,0.079976,0.023931,-0.087646,-0.011617,-0.000203,0.058309,0.032552,-0.001856,-0.088257,-0.062032,0.119456,-0.049662,0.079915,0.050784,-0.025208,-0.073283,0.180298,-0.039795,0.034861,0.193156,0.141184,-0.047582,-0.106445,-0.056693,0.075083,0.093058,-0.074443,-0.021119,-0.083577,-0.028463,0.022491,-0.019287,0.055715,0.013306,-0.039307,0.020365,-0.064616,-0.01935,-0.006999,0.007609,-0.042175,-0.102905,-0.034368,0.130412,-0.107117,-0.043416,-0.123088,0.181396,0.131307,0.003225,-0.134766,0.122396,-0.002294,-0.174845,0.117666,0.125977,0.220133,0.028374,-0.198934,-0.02888,0.10498,0.002747,0.107747,0.037516,0.129985,-0.030093,0.038066,-0.219859,-0.089681,0.0177,0.199636,-0.054667,0.010783,0.112122,0.076131,-0.108582,0.050252,-0.087362,-0.080597,-0.143778,-0.045369,-0.000427,-0.103897,0.111145,0.015849,-0.136922,-0.013631,-0.097796,0.202738,0.046794,-0.014954,-0.104451,0.026802,-0.004313,0.150177,-0.03301,0.145004,0.081421,-0.012248,0.168376,0.052175,-0.161316,-0.130046,0.039612,-0.11617,0.040436,-0.164225,-0.055257,-0.015299,congrats deserved award winner gwgia twitter a...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,908485890,28,0.0,6.035481,3.931826,4.158883,5.252273,congrats deserved award winner gwgia twitter a...,gwgia,WomensLeadrshp,tw account not identified,0
4,0.04185,0.011874,0.00087,0.11636,-0.045201,-0.092346,0.055884,0.009058,0.119873,-0.034528,-0.000253,-0.003101,-0.086755,0.099731,-0.152393,0.134985,0.117554,0.075342,-0.086426,0.078662,0.074268,0.044897,0.115576,0.016534,0.02964,0.140047,-0.169946,0.011022,-0.071838,0.002721,-0.047498,-0.035748,0.051038,-0.038345,0.101862,0.010095,0.1058,0.096767,0.021497,0.070856,0.066577,0.008514,0.10769,-0.092493,-0.113377,-0.139429,0.025978,-0.01214,-0.146069,-0.004535,0.063062,-0.074904,-0.065656,0.058105,-0.072534,0.081445,-0.103766,-0.03848,-0.086,-0.166553,-0.03727,0.016122,-0.132886,-0.019372,0.008618,-0.054723,-0.001251,0.061108,-0.02677,0.100653,-0.107672,0.040771,0.101331,-0.010526,-0.137744,-0.114219,0.009644,0.028064,-0.040039,0.030164,-0.022638,-0.003406,-0.096417,0.106445,-0.003076,-0.051038,-0.107304,0.13042,-0.044934,0.015417,0.110059,-0.019588,-0.053433,-0.128015,0.002332,0.009448,0.023735,-0.088187,0.143088,-0.060966,-0.067175,-0.133057,0.044406,0.004248,0.044672,-0.068866,-0.056653,-0.03335,0.145923,-0.105157,-0.040344,-0.050293,-0.110754,0.058716,0.047339,0.098546,0.063257,-0.056067,0.181519,0.080188,0.000537,0.009302,-0.094617,-0.040088,0.176581,-0.12854,-0.035355,-0.047463,0.162439,-0.067236,-0.085254,-0.093512,-0.087659,-0.162942,-0.140771,-0.027283,-0.001599,-0.030322,0.001086,0.027148,0.065686,-0.104749,0.04259,0.01944,0.012671,-0.078979,-0.047803,0.029321,-0.103125,-0.071252,0.067676,-0.029858,-0.070752,-0.009798,0.083917,-0.068579,0.008618,-0.098071,-0.049768,-0.05849,-0.060168,0.0401,0.018506,-0.010437,-0.021631,-0.065588,-0.044489,-0.09408,0.012891,0.048911,-0.021866,0.043634,0.085156,-0.091162,-0.018781,-0.141791,0.08302,-0.021155,-0.109839,-0.057446,-0.17572,-0.088184,-0.114349,-0.023901,0.024927,-0.006714,-0.031366,0.129227,0.049738,-0.008325,0.035193,0.009375,-0.038269,-0.064307,0.016064,0.11665,-0.125525,-0.032275,0.01615,-0.08205,-0.079651,0.050598,0.045227,0.050821,-0.026855,-0.031464,-0.000159,-0.045581,-0.03999,-0.047827,0.044617,0.047632,-0.012228,0.020285,-0.078027,0.124213,0.008447,0.028583,-0.09104,-0.118811,-0.003333,0.016968,0.016754,0.023718,0.015741,-0.156323,0.073425,0.135022,-0.004651,-0.046069,-0.015863,0.04657,0.050195,0.019781,0.100684,-0.06438,0.138882,-0.066742,0.049937,-0.079776,-0.024597,-0.025159,0.001929,-0.075439,-0.010855,0.055804,0.061731,0.046924,0.102417,-0.116528,0.024963,-0.082275,-0.003894,-0.037988,0.003934,-0.005005,-0.01857,0.055762,-0.017186,0.020416,-0.054956,-0.001379,-0.050317,0.03551,-0.035742,0.145776,-0.068066,0.002206,-0.054669,0.033447,0.033691,0.174512,0.052695,-0.026328,0.026733,0.007837,-0.090417,-0.01153,0.066528,0.036597,-0.041593,-0.072046,0.111548,0.124762,-0.066577,0.050116,-0.045959,0.118604,0.167102,0.018042,0.057135,0.109727,0.022656,-0.044318,0.007727,-0.011847,0.072589,-0.049911,-0.041998,-0.089526,dozen attorneys general defend subsidy payment...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,701086584,21,0.0,5.786897,2.397895,4.406719,5.347108,dozen attorneys general defend subsidy payment...,"insurance, obamacare",StephOliverTR5,tw account not identified,0


In [39]:
# Check the bin values
main_all_vectors2["All_impact_bins"].unique()

[2, 0, 1, NaN]
Categories (3, int64): [0 < 1 < 2]

In [40]:
# Check the number of NaN after transformation
main_all_vectors2.isnull().sum()  # 2 data points have not been assigned to classes. We will remove them as it will not impact the modeling

0                  0
1                  0
2                  0
3                  0
4                  0
                  ..
Snippet            0
TW_Hashtags        0
ALL_Author         0
TW_Account_Name    0
All_impact_bins    1
Length: 352, dtype: int64

In [41]:
df_modeling = main_all_vectors2.dropna(subset=["All_impact_bins"])

In [42]:
df_modeling.shape

(19847, 352)

In [43]:
# Check the distribution of data points
df_modeling[
    "All_impact_bins"
].value_counts()  # Split of data is almost equally distributed as the principle taken in the previous modeling phases

0    6872
1    6682
2    6293
Name: All_impact_bins, dtype: int64

In [44]:
# Remove not relevant features to match the final
final_modeling = df_modeling.drop(
    [
        "Post_ID",
        "ALL_Impact",
        "Snippet",
        "TW_Hashtags",
        "ALL_Author",
        "TW_Account_Name",
        "Title",
    ],
    axis=1,
)

In [45]:
final_modeling.shape

(19847, 345)

In [46]:
# Create a csv file
# final_modeling.to_csv('Final_modeling.csv')