In [2]:
import pandas as pd
import re

In [3]:
features = pd.read_csv(r"new_data/features.csv", engine='python')
features

Unnamed: 0,0
0,aa
1,aaah
2,aah
3,aaron
4,ab
...,...
4995,zoe
4996,zombie
4997,zombies
4998,zone


In [4]:
X_features = features[features.columns[0]]
features_list = X_features.values.tolist()
features_list[:10]

['aa',
 'aaah',
 'aah',
 'aaron',
 'ab',
 'abandon',
 'abby',
 'abc',
 'ability',
 'abit']

In [5]:
import string

def remove_punctuation(tweet):
    translator = str.maketrans('', '', string.punctuation)
    return tweet.translate(translator)

In [6]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))

def remove_stopwords(post):
    tokens = word_tokenize(post)
    tokens_nostop = [w for w in tokens if w not in stop_words]
    return tokens_nostop

[nltk_data] Downloading package stopwords to /Users/Shiqi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def preprocess(path):
    preprocessed = []
    
    tweets = pd.read_csv(path, engine="python")
    tweets = tweets.drop(tweets.columns[0], axis=1)
    X_tweets = tweets[tweets.columns[0]]
    tweets_list = X_tweets.values.tolist()
    
    tweets_lower = [t.lower() for t in tweets_list]
    for tweet in tweets_lower:
        tweets_nonum = re.sub(r'\d+', '', tweet)
        tweets_nopunc = remove_punctuation(tweets_nonum)
        tweets_nospace = " ".join(tweets_nopunc.split())
        preprocessed.append(tweets_nospace)
        
    for i in preprocessed[:3]:
        print(i, '\n')
        
    tokens_list = [remove_stopwords(p) for p in preprocessed]
    print(tokens_list[:3])
    return tokens_list

In [8]:
def get_matrix(features_list, tokens_list):
    matrix = []
    for tokens in tokens_list:
        l = []
        for f in features_list:
            l.append(tokens.count(f))
        matrix.append(l)
    return matrix

## Trump

In [44]:
trump_tokens_list = preprocess("dataset/Trump.csv")

if you tell me a being of pure white light speaks to donald j trump ill wonder if they were elohim 

thedemocrats in dont vote for trump he will start new wars thedemocrats in dont vote for… httpstcoqcjcbxxwfd 

awesometweet im sure trump will find a way to compliment himself 

[['tell', 'pure', 'white', 'light', 'speaks', 'donald', 'j', 'trump', 'ill', 'wonder', 'elohim'], ['thedemocrats', 'dont', 'vote', 'trump', 'start', 'new', 'wars', 'thedemocrats', 'dont', 'vote', 'for…', 'httpstcoqcjcbxxwfd'], ['awesometweet', 'im', 'sure', 'trump', 'find', 'way', 'compliment']]


In [46]:
trump_matrix = get_matrix(features_list, trump_tokens_list)

In [47]:
trump_matrix_df = pd.DataFrame(trump_matrix)

In [49]:
trump_matrix_df.to_csv('X_trump.csv', index=False)

## Coronavirus

In [51]:
corona_tokens_list = preprocess("dataset/coronavirus.csv")
corona_matrix = get_matrix(features_list, corona_tokens_list)
corona_matrix_df = pd.DataFrame(corona_matrix)
corona_matrix_df.to_csv('X_coronavirus.csv', index=False)

realjameswoods last hours our potus ✅ huge trump rally in charleston plus ✅ conveens emergency… httpstcoevdvylzpg 

secazar we the people are not paying to get treated for the coronavirus i’m reading reports americans are recei… httpstcopovpmxg 

as of confirmed cases in areas outside prc are picking up speed ncov coronavirus covid… httpstcogbgcoqpsw 

[['realjameswoods', 'last', 'hours', 'potus', '✅', 'huge', 'trump', 'rally', 'charleston', 'plus', '✅', 'conveens', 'emergency…', 'httpstcoevdvylzpg'], ['secazar', 'people', 'paying', 'get', 'treated', 'coronavirus', '’', 'reading', 'reports', 'americans', 'recei…', 'httpstcopovpmxg'], ['confirmed', 'cases', 'areas', 'outside', 'prc', 'picking', 'speed', 'ncov', 'coronavirus', 'covid…', 'httpstcogbgcoqpsw']]


## MeToo

In [9]:
metoo_tokens_list = preprocess("dataset/Metoomovement.csv")
metoo_matrix = get_matrix(features_list, metoo_tokens_list)
metoo_matrix_df = pd.DataFrame(metoo_matrix)
metoo_matrix_df.to_csv('X_metoo.csv', index=False)

aerynthrace the latest metoomovement httpstcouxnypvhdbl httpstcoyaplxvim 

miriamsved involved in a wonderful discussion about the metoomovement with latrioli yesterday when asked if t… httpstcospqahwjns 

fawazar that show talks about media and the metoomovement in a very interesting way wellreceived 

[['aerynthrace', 'latest', 'metoomovement', 'httpstcouxnypvhdbl', 'httpstcoyaplxvim'], ['miriamsved', 'involved', 'wonderful', 'discussion', 'metoomovement', 'latrioli', 'yesterday', 'asked', 't…', 'httpstcospqahwjns'], ['fawazar', 'show', 'talks', 'media', 'metoomovement', 'interesting', 'way', 'wellreceived']]
