In [41]:
import os
import datetime
import json
import twython
import json
import csv
import re

In [2]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
import nltk
from nltk.tokenize import word_tokenize

In [3]:
import pandas as pd
import numpy as np 

In [4]:
# Raise pandas' display limits so wide or long frames are shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 10000)

In [5]:
def reset(df):
    """
    Renumber the rows of ``df`` in place, discarding the old index.

    Shorthand for ``df.reset_index(drop=True, inplace=True)``.  The frame that
    was passed in is modified directly; the function hands back whatever the
    in-place pandas call returns (None), so callers keep using ``df`` itself
    rather than the return value.
    """
    outcome = df.reset_index(drop=True, inplace=True)
    return outcome

def remove_tweets(df, column, list_to_remove):
    """
    Drop every row whose ``column`` value contains any of the given words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.  It is modified in place (rows dropped, index
        renumbered before each search word is processed).
    column : str
        Name of the column to search.  Non-string cells (e.g. NaN) are skipped.
    list_to_remove : iterable of str
        Search words.  Each word is used exactly as given (its capitalisation
        is not changed) and is looked for as a substring of both the raw cell
        text and the lowercased cell text, so a word also matches when it sits
        inside a longer word.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with matching rows removed.
    """
    for item in list_to_remove:
        # Renumber rows so positions from enumerate() equal index labels.
        df.reset_index(drop=True, inplace=True)
        matching_rows = [
            position
            for position, cell in enumerate(df[column])
            if isinstance(cell, str) and (item in cell or item in cell.lower())
        ]
        # Drop all matches for this word in one call instead of row by row.
        df.drop(index=matching_rows, inplace=True)
    return df

def remove_phrase(df, column, dict_to_remove):
    """
    Drop every row whose ``column`` value contains both words of a pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.  It is modified in place (rows dropped, index
        renumbered before each pair is processed).
    column : str
        Name of the column to search.  Non-string cells (e.g. NaN) are skipped.
    dict_to_remove : dict of str -> str
        Word pairs.  A row is removed when its text contains the key AND the
        value.  Matching is case-insensitive (both words and the cell text are
        lowercased before comparing) and substring-based, so a word also
        matches when it sits inside a longer word.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with matching rows removed.
    """
    for first_word, second_word in dict_to_remove.items():
        first_word = first_word.lower()
        second_word = second_word.lower()
        # Renumber rows so positions from enumerate() equal index labels.
        df.reset_index(drop=True, inplace=True)
        matching_rows = []
        for position, cell in enumerate(df[column]):
            if not isinstance(cell, str):
                continue
            lowered = cell.lower()
            if first_word in lowered and second_word in lowered:
                matching_rows.append(position)
        # Drop all matches for this pair in one call instead of row by row.
        df.drop(index=matching_rows, inplace=True)
    return df

def clean_string(string):
    """
    Normalise a piece of tweet text.

    Steps, in order: delete every character listed in the first literal
    below; turn each character of the second literal (punctuation, newline,
    slash, underscore, backslash) into a single space, lowercasing as it goes;
    finally replace '&' with 'and'.  Returns the cleaned string.
    """
    # NOTE(review): the non-ASCII characters in this literal look like a
    # mis-decoded right single quotation mark -- confirm against the original
    # notebook before changing it.
    cleaned = string
    for quote_char in "'‚Äô":
        cleaned = cleaned.replace(quote_char, '')
    for punctuation in "`@#();-=+~:,.?!''\n/_\\":
        cleaned = cleaned.replace(punctuation, ' ')
        cleaned = cleaned.lower()
    return cleaned.replace('&', 'and')

def count_vectorize(text):
    """
    Count how often each element occurs in ``text``.

    ``text`` is an iterable of hashable tokens (it is walked twice, so pass a
    list rather than a one-shot iterator).  Returns a dict mapping every
    distinct token to its number of occurrences; keys appear in the iteration
    order of ``set(text)``.
    """
    tallies = dict.fromkeys(set(text), 0)
    for token in text:
        tallies[token] = tallies[token] + 1
    return tallies



In [6]:
# Load both raw tweet exports and stack them into one frame.  Each file keeps
# its own row labels, so the combined index contains repeats until it is reset.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([
    pd.read_csv('data/3.26.twitter'),
    pd.read_csv('data/tweets_matt.csv', low_memory=False),
])

In [7]:
# Keep only the columns used in the rest of the notebook, then renumber rows.
kept_columns = ['user', 'user_id', 'text', 'favorite_count', 'symbols',
                'retweet_count', 'mentions', 'hashtag', 'post_id']
df = df[kept_columns]
reset(df)

In [8]:
# The same post can appear more than once in the combined data; keep the first
# row for each post_id.  Dropping rows leaves gaps in the index, so renumber
# the rows once the duplicates are gone.
df = df.drop_duplicates(subset=['post_id'])
reset(df)

In [9]:
# Per the original author's note, the hashtag and mention columns were built
# upstream by joining strings, which leaves a single space in cells that had
# nothing to join.  Replace that placeholder with NaN so pandas treats those
# cells as missing rather than as text.
for column_name in ('mentions', 'hashtag'):
    df.loc[df[column_name] == ' ', column_name] = np.nan

In [10]:
# Coerce like counts to float and round them to whole numbers.
df['favorite_count'] = df['favorite_count'].astype(float).round()

In [11]:
# Discard rows that have no tweet text.
df = df.dropna(subset=['text'])

In [12]:
# When several rows share the same text keep only the most-liked one:
# sort by likes (highest first) and keep the first row per distinct text.
df = df.sort_values(by='favorite_count', ascending=False)
df = df.drop_duplicates(subset='text', keep='first')
reset(df)
# Lowercase the text for the matching steps that follow.
df['text'] = df['text'].str.lower()

In [13]:
# Strip the retweet marker.  The text column has already been lowercased, so
# the marker appears as "rt" (searching for 'RT' would never match).  Word
# boundaries keep words that merely contain "rt" (e.g. "start") intact.
df['text'] = df['text'].str.replace(r'\brt\b', '', regex=True)

In [15]:
# Blocklist for the 'hashtag' column: a tweet is removed when its hashtag text
# contains any of these strings (see the remove_tweets call below).
hashtags = ['WandaVision', 'findyourthing', 'gangsters', 'RBandME', 'music', 'homework', 'Termpaper', 'newmusic', 
            'nowplaying', 'fridaylivestream',  'BTSpace', 'CRAVITY',  'Bitcoin', 'StanWorld', 'BecomeOneForIZONE', 'ovni', 
            'Colchester', 'NowPlaying', 'IZONE_PERMANENT', 'Onlineclass', 'WorldBookDay', 'SatyamevaJayate2', 'izone_permanent',
            'SnyderCut', 'ÎßàÎßàÎ¨¥', 'MAMAMOO',  'EVA71', 'ÌïòÍ≤å', 'OurParallelUniverseContinues', 'ÿ¨€åŸà_ÿ™Ÿà_ÿπ€åÿ≥€åŸ∞_⁄©€å_ÿ∑ÿ±ÿ≠', 'dogecoin', 
            'ShowtimeBetAngMalupet', 'DidYouKnow', 'VoteHarryStyles', 'AMNùóòùó¶ùóúùóî',  'GRAMMYs', 'etsy', 'MyanmarMilitaryTerrorists',
            'Poshmark', 'StPatricksDay', 'MindBreeze', 'ad', 'ArtOfTheBlue', '‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ‡§≠‡§æ‡§à_‡§∏‡•Å‡§®‡•ã‡§Ö‡§≤‡•ç‡§≤‡§æ‡§π‡§ï‡•Ä‡§∏‡§ö‡•ç‡§ö‡§æ‡§à', 'Essaydue', 'BigData', 
            'Aylesbury',  'PiDay', 'Harpenden',  'SoundCloud', 'Dogecoin', 'doge',  'ÿπŸÖÿ±ÿßŸÜ_ŸÜ€åÿßÿ≤€å_⁄ØŸπÿ±_⁄©ÿß_⁄©€å⁄ëÿß', 'ZackSnydersJusticeLeague',
            'NFT',  'ifttt', 'Shopee33Comeback', 'ÏõêÏñ¥Ïä§',  'Ïù¥ÎèÑ', 'ÏÜîÎùº_ÎπàÏÑºÏ°∞ost_Adrenaline', 'HadiahLightstickDariShopee', 
            '3Ïõî24Ïùº_Ï∞¨Ïó¥_ÎçîÎ∞ïÏä§_Í∞úÎ¥â',  'ÏóëÏÜå',  'ÏàòÌò∏',  'ÎîîÏò§', 'ÏãúÏö∞ÎØº', 'ÏàòÌò∏', 'BanglaChaayeBJPModel','CHANYEOL','WeLoveYouBaekhyun',
            'OnXiuweetTimeAtHome', 'Ï∞¨Ïó¥', 'NSFW', 'nsfw', '‡§Æ‡§π‡§∞‡•ç‡§∑‡§ø‡§¶‡§Ø‡§æ‡§®‡§Ç‡§¶_‡§ï‡§æ_‡§Ö‡§ú‡•ç‡§û‡§æ‡§®','DollWithBaekhyun',  'BAEKHYUN', 'XIUMIN', 'BCU_RYS21', 
            'OprahMeghanHarry', 'AuspiXius', 'SUHO', 'DollWithBBHxKDY', 'iCANimagine', 'thewildsspace', 'XiuweetTimeWithYou', 'DYK']

# Blocklist for the 'user' column.
users = ['artemis_twt']

# Blocklist for the 'text' column (substring matches).
text = ['esa_celebnews', 'superstraight', 'seekthetruth', 'izone', 'tarotbybronx', 'cryptoart', 'nsfw', 'meme king', 'minecraft', 
        'artemis and luna', 'brasileiro', 'myanmar coup', 'baekhyun', 'doyoung', 'band', 'kpop', 'cuddles']

# Word pairs for remove_phrase: a tweet is removed when its text contains both words of a pair.
dict_to_remove = {'bruno': 'mars', 'space':'jam'}

In [16]:
# Pair each dataframe column with the blocklist that applies to it, then
# filter the frame one column at a time.
dict_of_lists = {'user' : users, 'hashtag' : hashtags, 'text': text}

for column_name, blocklist in dict_of_lists.items():
    df = remove_tweets(df, column_name, blocklist)

In [17]:
# Can be slow on large frames; only re-run this cell when necessary.
df = remove_phrase(df, column='text', dict_to_remove=dict_to_remove)

In [18]:
# Exploration: gather hashtags from the most frequent values of the hashtag
# column and print the ones that are not already in the known space-related
# list below, to help spot further off-topic tags.
count = 0
empty_list = []

# value_counts() is ordered most frequent first.  Its index is iterated
# directly because the column name produced by reset_index() differs between
# pandas versions.  Loop variables are named so that they do not overwrite the
# `hashtags` blocklist defined earlier in the notebook.
for hashtag_value in df['hashtag'].value_counts().index:
    if count >= 250:
        break
    # A cell may hold several space-separated hashtags.
    for tag in str(hashtag_value).split():
        empty_list.append(tag)
        count = count + 1

items = ['jaxa', 'esa', 'curiosityrover', 'areospace', 'internationalspacestation', 'JAXA', 'astronomy',
         'oppertunityrover', 'virgingalactic', 'universe', 'sls', 'Starship', 'climate', 'starship', 'virginorbit', 
         'nasa', 'cosmos', 'mars', 'falconheavy', 'NASA', 'futurism', 'starliner', 'iss', 'spacex', 'falcon9', 
         'nasa_app', 'roscosmos', 'Roscosmos', 'blueorigin', 'ESA', 'spacetravel', 'artemis', 'marswebcam', 'starlink']

fresh_hashtags = [tag for tag in empty_list if tag not in items]

print(set(fresh_hashtags))

{'Aliens', 'IWD2021', 'BTC', 'science', 'SolarAdrenalineOST', 'Ethiopian', 'astrophotography', 'ElonMusk', 'SmartNews', 'WomensDay', 'COVID19', 'Hubble30', 'ISS', 'SpacePicture', 'ISS_overLeHaillan', 'nft', 'APOD', 'bitcoin', 'WomensHistoryMonth', 'Astronomy', 'SpaceX', 'Moon_awards', 'Nasa', 'Louisville', 'Astrophotography', 'Starlink', 'Quantum', 'Hubble', 'KeepLookingUp', 'NEWS', 'RT', 'EXOLEAVINGSM', 'Technology', 'news', '10400DaysWithCHEN', 'internationalwomensday2021', 'Wallpaper', 'SN10', 'AskNASA', 'astrology', 'LPSC2021', 'NASA_App', 'dearMoonCrew', 'job', 'OTD', 'Science', 'Universe', 'ASTRO', 'InternationalWomensDay', 'StarTrek', 'UFO', 'MarsDay21', 'OVNI', 'Tigray', 'InternationalSpaceStation', 'EU', 'exoplanet', 'StormHour', 'onlineclasses', 'SLS', 'Aerospace', 'MarsPerseverance', 'Chicago', 'Venus', 'USA', 'mars2021', 'Nursing', 'perseverance', 'CountdownToMars', 'SPACE', 'AstroNomoLogy', 'hindi', 'VirginGalactic', 'crypto', 'Mars2021', 'Myanmar', 'Statistics', 'Space', 

In [19]:
# Clean every tweet's text and strip whatever is not plain ASCII
# (encode with errors ignored, then decode back to str).
reset(df)
df['text'] = [
    clean_string(str(raw_text)).encode("ascii", "ignore").decode()
    for raw_text in df['text']
]

In [20]:
# df.to_csv('clean_tweet')

In [21]:
# Flatten all tweets into one list of tokens, skipping empty tokens and
# anything that looks like a link.
reset(df)
one_big_list = []

for tweet_text in df['text']:
    for word in word_tokenize(str(tweet_text)):
        # Strip leftover quote/bracket/comma characters from the token.
        for symbol in "'[],":
            word = word.replace(symbol, "")
        if word == '' or word.startswith('//') or word.startswith('http'):
            continue
        one_big_list.append(word)

In [22]:
# Count how many times each token occurs across all tweets.
vectorized = count_vectorize(one_big_list)

In [23]:
# Write one "word,count" row per token.  No header row is written: the cells
# that read this file back rely on the first data row being used as the header.
# The with-block closes the file even if a write fails, and newline="" is what
# the csv module expects so that it controls line endings itself.
with open("data/vect_twts.csv", "w", encoding="utf-8", newline="") as a_file:
    writer = csv.writer(a_file)
    for key, value in vectorized.items():
        writer.writerow([key, value])

In [24]:
# Read the word counts back in.  The file has no header row and header=None is
# not passed, so pandas uses the first (word, count) row as the column labels;
# the following cells put that row back into the data.
new_df = pd.read_csv('data/vect_twts.csv')

In [25]:
# Sort by the count column, most frequent first.  The column is addressed by
# position because its label is whatever count happened to be in the first CSV
# row (read_csv consumed that row as the header), which is not always '1'.
new_df = new_df.sort_values(new_df.columns[1], ascending=False)

In [26]:
# read_csv used the first (word, count) pair as column labels; rebuild that
# pair as a one-row frame so it can be added back to the data.
add_df = pd.DataFrame({
    'word': [new_df.columns[0]],
    'count': [new_df.columns[1]],
})

In [27]:
# Give the columns their real names and add the recovered first row back.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); ignore_index
# renumbers the rows so the added row does not duplicate index label 0.
new_df = new_df.rename(columns={new_df.columns[0]:'word', new_df.columns[1]:'count'})
new_df = pd.concat([new_df, add_df], ignore_index=True)

In [28]:
# The recovered header row arrives as text, so make every count an integer
# before sorting from most to least frequent.
new_df = (
    new_df
    .assign(count=new_df['count'].astype(int))
    .sort_values(by='count', ascending=False)
)

In [None]:
# Add, for every word, the repost counts of the tweets that contain it
# (once per occurrence of the word in a tweet).

# The repost column is called 'retweet_count' until the rename cell further
# down has run, so accept either name.
repost_column = 'repost_count' if 'repost_count' in df.columns else 'retweet_count'
# Missing or non-numeric repost counts contribute nothing.
repost_counts = pd.to_numeric(df[repost_column], errors='coerce').fillna(0)

# Single pass over the tweets: total reposts per space-delimited token.  This
# matches a word only when it is surrounded by spaces (or the text edges),
# without treating the word as a regular expression.
repost_weight = {}
for tweet_text, reposts in zip(df['text'].astype(str), repost_counts):
    for token in tweet_text.split(' '):
        if token:
            repost_weight[token] = repost_weight.get(token, 0) + reposts

# Row labels must be unique for the element-wise addition below.
new_df = new_df.reset_index(drop=True)
extra_counts = new_df['word'].astype(str).map(repost_weight).fillna(0)
new_df['count'] = (new_df['count'] + extra_counts).astype(int)

In [None]:
# Save the word counts (including the repost weighting) to disk.
new_df.to_csv('data/tw_text_counts_incl_rts.csv')

In [None]:
# Append each row's hashtag string to its text, unless the hashtag cell is a
# float (i.e. NaN / missing) or the text already contains that string.
for row in range(len(df)):
    tags = df.at[row, 'hashtag']
    if type(tags) == float:
        continue
    body = df.at[row, 'text']
    if tags not in body:
        df.at[row, 'text'] = body + ' ' + tags

In [None]:
# Keep the columns needed for the export and rename retweet_count to the
# platform-neutral repost_count.
export_columns = ['user_id', 'text', 'favorite_count','retweet_count', 'mentions',  'post_id']
df = df[export_columns].rename(columns={'retweet_count':'repost_count'})

In [None]:
# Save the cleaned tweets to disk.
df.to_csv('data/cleaned_tweets.3.30.csv')