In [1]:
import numpy as np
import pandas as pd
import re, sys, os, csv
from many_stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

In [2]:
#stop_words = list(get_stop_words('en'))         #About 900 stop words
#nltk_words = list(stopwords.words('english'))   #About 150 stop words
#stop_words.extend(nltk_words)

def word_prob(word):
    """Return the relative frequency of ``word`` in the loaded word list.

    Reads the module-level ``dictionary`` counter and ``total`` token count.
    Because ``dictionary`` is a ``Counter``, a word it has never seen counts
    as 0 and therefore gets probability 0.0.
    """
    occurrences = dictionary[word]
    return occurrences / total
def words(text):
    """Return every maximal run of the letters a-z in ``text``, lower-cased.

    Digits, punctuation, underscores and whitespace all act as separators.
    """
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)
# Word-frequency table built from the merged word list. The file is read
# inside a `with` block so its handle is closed as soon as the text has been
# consumed instead of lingering for the lifetime of the kernel.
with open('dataset/wordlists/merged.txt') as wordlist_file:
    dictionary = Counter(words(wordlist_file.read()))
# Length of the longest known word; viterbi_segment uses it to bound how far
# back it looks for a candidate word.
max_word_length = max(map(len, dictionary))
# Total number of tokens, kept as a float so word_prob performs true division.
total = float(sum(dictionary.values()))

def viterbi_segment(text):
    """Split ``text`` into the most probable sequence of words.

    Dynamic programme over prefixes of ``text``: for each end position the
    best split point is the one maximising (probability of the best
    segmentation of the prefix before it) * (``word_prob`` of the final
    piece). Candidate pieces are at most ``max_word_length`` characters long.

    Returns:
        A ``(pieces, probability)`` tuple: the list of substrings that make
        up ``text`` in order, and the probability of that segmentation.
        An empty ``text`` gives ``([], 1.0)``.
    """
    # best_prob[n]: probability of the best segmentation of text[:n]
    # cut_at[n]:    start index of the last piece in that segmentation
    best_prob = [1.0]
    cut_at = [0]
    for end in range(1, len(text) + 1):
        earliest = max(0, end - max_word_length)
        candidates = [
            (best_prob[start] * word_prob(text[start:end]), start)
            for start in range(earliest, end)
        ]
        # Tuples compare by probability first, then by start index, so ties
        # on probability resolve to the largest start (the shortest piece).
        score, start = max(candidates)
        best_prob.append(score)
        cut_at.append(start)

    # Walk the back-pointers from the end of the text to recover the pieces.
    pieces = []
    end = len(text)
    while end > 0:
        start = cut_at[end]
        pieces.append(text[start:end])
        end = start
    pieces.reverse()
    return pieces, best_prob[-1]

def fix_hashtag(text):
    """Turn one matched hashtag into space-separated words.

    Written as a replacement callback for ``re.sub``: despite its name,
    ``text`` is the regex match object for a hashtag, not a plain string.

    Args:
        text: Match object whose ``group()`` is the hashtag including its
            leading ``#``.

    Returns:
        The hashtag body (without ``#`` and without one leading digit, if
        present) split into words by ``viterbi_segment`` and joined with
        single spaces.
    """
    tag = text.group().split(":")[0]  # keep only what precedes any ':'
    tag = tag[1:]  # remove '#'
    # Drop a single leading digit. ``isdecimal`` accepts the same single
    # characters ``int()`` would, and the slice is safe on an empty tag, so
    # no catch-all ``except`` is needed to get the previous behaviour.
    # NOTE(review): only the first digit is removed ("#2016x" keeps "016x");
    # confirm whether all leading digits should go.
    if tag[:1].isdecimal():
        tag = tag[1:]
    return ' '.join(viterbi_segment(tag)[0])

def clean_tweet(tweet):
    """Normalise a raw tweet to lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs (anything shaped like ``scheme://...``);
      3. expand each ``#hashtag`` into words via ``fix_hashtag``;
      4. remove ``@mentions`` and every character that is not an ASCII
         letter, a digit, a space or a tab;
      5. collapse runs of whitespace.

    Args:
        tweet: The tweet text.

    Returns:
        The cleaned text; an empty string if nothing survives.
    """
    tweet = tweet.lower()
    # URLs are stripped before hashtags are expanded. Otherwise a '#fragment'
    # inside a link gets segmented into several words, the URL pattern stops
    # at the first inserted space, and the remaining words leak into the
    # output.
    tweet = re.sub(r"\w+://\S+", " ", tweet)
    tweet = re.sub(r"(#[A-Za-z0-9]+)", fix_hashtag, tweet)
    # Raw strings keep regex escapes from being parsed as (invalid) Python
    # string escapes.
    # NOTE(review): '_' is not part of the mention pattern, so "@some_user"
    # leaves "user" behind - confirm whether that is intended.
    tweet = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", tweet)
    return ' '.join(tweet.split())

def remove_stopwords(word_list, stop_words=None):
    """Lower-case ``word_list`` and drop every stop word.

    Args:
        word_list: Iterable of word strings.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        The surviving words, lower-cased and joined by single spaces, with
        leading whitespace stripped; ``""`` when nothing survives.
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Build the lookup set once; the list used to be re-fetched and scanned
    # linearly for every single word.
    blocked = set(stop_words)
    lowered = (word.lower() for word in word_list)
    kept = [word for word in lowered if word not in blocked]
    return " ".join(kept).lstrip()
    
# Emotion labels grouped by quadrant, listed in code order. Codes are assigned
# consecutively from 1 across the four groups, so the position of a label in
# these tuples determines its integer code.
_HAPPY_ACTIVE_LABELS = (  # codes 1-19
    "elated", "overjoyed", "enjoy", "excited", "proud", "joyful",
    "feelhappy", "sohappy", "veryhappy", "happy", "superhappy",
    "happytweet", "feelblessed", "blessed", "amazing", "wonderful",
    "excelent", "delighted", "enthusiastic",
)
_HAPPY_INACTIVE_LABELS = (  # codes 20-39
    "placid", "calm", "calming", "peaceful", "quiet", "silent", "serene",
    "convinced", "consent", "contented", "contentment", "satisfied",
    "relax", "relaxed", "relaxing", "sleepy", "sleepyhead", "asleep",
    "resting", "restful",
)
_UNHAPPY_ACTIVE_LABELS = (  # codes 40-59
    "nervous", "anxious", "tension", "afraid", "fearful", "angry",
    "annoyed", "annoying", "stress", "distressed", "distress",
    "stressful", "stressed", "worried", "tense", "bothered", "disturbed",
    "irritated", "mad", "furious",
)
_UNHAPPY_INACTIVE_LABELS = (  # codes 60-80
    "sad", "ifeelsad", "feelsad", "sosad", "verysad", "sorrow",
    "disappointed", "supersad", "miserable", "hopeless", "depress",
    "depressed", "depression", "fatigued", "gloomy", "nothappy",
    "unhappy", "suicidal", "downhearted", "hapless", "dispirited",
)
_LABEL_CODES = {
    name: code
    for code, name in enumerate(
        _HAPPY_ACTIVE_LABELS
        + _HAPPY_INACTIVE_LABELS
        + _UNHAPPY_ACTIVE_LABELS
        + _UNHAPPY_INACTIVE_LABELS,
        start=1,
    )
}


def vectorise_label(label):
    """Map an emotion label string to its integer code (1-80).

    Matching is exact and case-sensitive. A label that is not in the table
    yields ``None``.
    """
    return _LABEL_CODES.get(label)
    

In [3]:
def read_csv(file, lst):
    """Append the cleaned second column of every data row in a CSV to ``lst``.

    The first row is treated as a header and skipped. Rows that have no
    second column (for example blank lines, which ``csv.reader`` yields as
    ``[]``) are skipped instead of raising ``IndexError``.

    Args:
        file: Path of the CSV file to read.
        lst: List that receives one ``clean_tweet`` result per data row;
            it is mutated in place.

    Returns:
        None.
    """
    with open(file, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; harmless on an empty file
        for row in reader:
            if len(row) < 2:
                continue
            lst.append(clean_tweet(row[1]))

In [7]:
# Label written next to every tweet in the output file.
# NOTE(review): nothing in this notebook section says which class 8 stands
# for - confirm against the consumer of hate_processes.csv.
HATE_LABEL = 8

hate = []
read_csv("hate.csv", hate)

# Disabled loads for the other emotion files:
# worry = []
# read_csv("raw_data/worried.csv", worry)
# #read_csv("worry.csv", worry)
# read_csv("raw_data/anxiety.csv", worry)
# happy = []
# read_csv("raw_data/happy.csv", happy)
# sad = []
# read_csv("raw_data/sad.csv", sad)

# Write inside a `with` block so the file is flushed and closed when the cell
# finishes; newline='' lets the csv writer control line endings itself.
with open('hate_processes.csv', 'w', newline='') as out_file:
    dataWriter = csv.writer(out_file, delimiter=',', lineterminator="\n")
    for tweet in hate:
        dataWriter.writerow([tweet, HATE_LABEL])

In [9]:
# Count the rows of data_new.csv; every parsed row is counted, including the
# first one.
with open('data_new.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    count = sum(1 for _ in readCSV)
print(count)

5309
