### Feature extraction from tweets

In [1]:
# import important libraries
import os,sys
import json
import nltk
from nltk.corpus import stopwords
import string
import pickle
import re
from textblob import TextBlob
import csv
import numpy as np

In [2]:
# import important libraries
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.tokenize import RegexpTokenizer

unable to import 'smart_open.gcs', disabling that module


In [3]:
# Load the pre-trained GoogleNews Word2Vec vectors that sit in the current
# working directory. `limit` caps how many vectors are read from the file.
filename = os.getcwd() + '/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, limit=100000, binary=True)

# English stop words, kept as a set for fast membership tests.
stop = set(stopwords.words('english'))
print("done")

done


#### Defining the `Data` class, which reads each tweet and extracts its features

In [7]:
class Data:
    """A group of tweets (a source tweet and its replies) plus feature extraction.

    Attributes:
        structure: the thread structure object supplied by the caller.
        tweet_data: mapping of tweet id -> tweet dict; each tweet dict must
            provide a "text" string and an "entities" dict.
        source: id of the source tweet of the thread.

    Feature extraction relies on the module-level ``model`` (word vectors) and
    ``stop`` (stop-word set) objects.
    """

    # Number of word-vector values kept per tweet; shorter tweets are
    # zero-padded and longer ones truncated to exactly this length.
    EMBEDDING_SIZE = 2000

    # File holding one bad word per line.
    SWEARWORDS_PATH = '../DataSet/badwords.txt'

    # Strips @mentions, non-alphanumeric characters and URLs before sentiment
    # analysis. Raw string so the regex escapes are not parsed as (invalid)
    # Python string escapes.
    _CLEAN_TWEET_RE = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    # Negation words counted as a feature.
    # NOTE(review): nltk.word_tokenize usually splits contractions
    # (e.g. "don't" -> "do", "n't"), so the contraction entries below may
    # never match -- confirm against the tokenizer version in use.
    NEGATION_WORDS = frozenset([
        "not", "no", "nobody", "nothing", "none", "never", "neither", "nor",
        "nowhere", "hardly", "scarcely", "barely", "don’t", "isn’t", "wasn’t",
        "shouldn’t", "wouldn’t", "couldn’t", "doesn’t",
    ])

    # Lazily-loaded cache of the bad-word list (shared by all instances).
    _swearwords = None

    def __init__ (self,structure, tweet_data, source):
        self.structure = structure
        self.tweet_data = tweet_data
        self.source = source

    @classmethod
    def _load_swearwords(cls):
        """Return the lower-cased bad-word set, reading the file only once."""
        if cls._swearwords is None:
            with open(cls.SWEARWORDS_PATH, 'r') as f:
                cls._swearwords = frozenset(
                    line.strip().lower() for line in f)
        return cls._swearwords

    def _embedding_features(self, tokens):
        """Concatenate the word vectors of *tokens* into a fixed-length list.

        Tokens missing from the vector model are skipped. The result always
        has exactly ``EMBEDDING_SIZE`` entries (zero-padded or truncated).
        """
        size = self.EMBEDDING_SIZE
        feature = []
        for token in tokens:
            if len(feature) >= size:
                # Anything beyond this point would be truncated anyway.
                break
            # Index the KeyedVectors object directly: the `.wv` alias on
            # KeyedVectors was removed in gensim 4.
            if token in model:
                feature.extend(model[token].tolist())

        # append 0 if not enough features were found, then cut to size
        feature.extend([0] * (size - len(feature)))
        return feature[:size]

    def getfeature(self, tweet):
        """Build the feature vector of a single tweet.

        Args:
            tweet: tweet dict with a "text" string and an "entities" dict
                holding 'hashtags', 'user_mentions' and 'urls' collections
                and, optionally, a 'media' key.

        Returns:
            A list of ``EMBEDDING_SIZE + 11`` numbers: the word-vector values
            followed by flags for question mark, exclamation mark, hashtags,
            user mentions, urls, media and positive sentiment, then the
            upper-case character ratio, the punctuation-token count, the
            negation-word count and the bad-word count.
        """
        text = tweet["text"]
        words = nltk.word_tokenize(text)

        tokenizer = RegexpTokenizer(r'\w+')
        word_nopunc = [tok for tok in tokenizer.tokenize(text)
                       if tok not in stop]

        # getting features using word2vec
        feature = self._embedding_features(word_nopunc)

        # Has question marks / exclamation marks. `in` also catches a mark at
        # the very first character, which `find(...) > 0` missed.
        feature.append(1 if '?' in text else 0)
        feature.append(1 if '!' in text else 0)

        entities = tweet['entities']
        # has hashtags
        feature.append(1 if len(entities['hashtags']) > 0 else 0)
        # has user mentions
        feature.append(1 if len(entities['user_mentions']) > 0 else 0)
        # has urls
        feature.append(1 if len(entities['urls']) > 0 else 0)
        # has media
        feature.append(1 if 'media' in entities else 0)

        # sentiment analysis: 1 only for strictly positive polarity
        clean_tweet = ' '.join(self._CLEAN_TWEET_RE.sub(" ", text).split())
        analysis = TextBlob(clean_tweet)
        feature.append(1 if analysis.sentiment.polarity > 0 else 0)

        # Ratio of upper-case characters to all characters; an empty text
        # gets 0.0 instead of raising ZeroDivisionError.
        upper_count = sum(1 for ch in text if ch.isupper())
        feature.append(upper_count / len(text) if text else 0.0)

        # count number of punctuation tokens and negative words
        # (a token counts as punctuation when it occurs inside
        # string.punctuation, so a token such as "..." is not counted)
        count_punct = 0
        count_neg_words = 0
        for word in words:
            if word in string.punctuation:
                count_punct += 1
            if word in self.NEGATION_WORDS:
                count_neg_words += 1
        feature.append(count_punct)
        feature.append(count_neg_words)

        # count the number of bad words; the list is stored lower-cased, so
        # tokens are lower-cased before the lookup.
        swearwords = self._load_swearwords()
        feature.append(
            sum(1 for token in word_nopunc if token.lower() in swearwords))

        # returning list of features
        return feature

    def extract_features(self):
        """Return a dict mapping every tweet id in ``tweet_data`` to its features."""
        return {tweet_id: self.getfeature(tweet)
                for tweet_id, tweet in self.tweet_data.items()}

In [8]:
def _visible_entries(directory):
    """Return the entry names of *directory*, skipping hidden ones.

    Names starting with '.' (e.g. '.DS_Store' or '._*' files) are not tweet
    data and would otherwise break the directory walk or the JSON parsing.
    """
    return [name for name in os.listdir(directory)
            if not name.startswith('.')]


def load_thread(thread_dir):
    """Load one conversation thread from *thread_dir* into a ``Data`` object.

    The directory must contain ``structure.json``, a ``source-tweets``
    folder (its first listed file is taken as the source tweet) and a
    ``reactions`` folder with one JSON file per reply. Tweet ids are the
    file names up to the first '.'.
    """
    # store structure of tweets
    with open(os.path.join(thread_dir, 'structure.json')) as f:
        structure = json.load(f)

    source_dir = os.path.join(thread_dir, 'source-tweets')
    replies_dir = os.path.join(thread_dir, 'reactions')

    # store source tweet
    source_file = _visible_entries(source_dir)[0]
    source = source_file.split('.')[0]

    # store all twitter data, keyed by tweet id
    tweet_data = {}
    with open(os.path.join(source_dir, source_file)) as f:
        tweet_data[source] = json.load(f)

    for reply_file in _visible_entries(replies_dir):
        with open(os.path.join(replies_dir, reply_file)) as f:
            tweet_data[reply_file.split('.')[0]] = json.load(f)

    return Data(structure, tweet_data, source)


# Read DATA
path = '../DataSet/pheme-dataset/threads/en'    # Path for training dataset
fold = _visible_entries(path)

# Training threads are grouped per topic: one inner list of Data per topic
data = []
for topic in fold:
    topic_dir = os.path.join(path, topic)
    data.append([load_thread(os.path.join(topic_dir, thread))
                 for thread in _visible_entries(topic_dir)])

# Find feature vectors for each tweet
X_data = {}
for topic_threads in data:
    for thread in topic_threads:
        # update() merges in place instead of rebuilding the whole dict
        X_data.update(thread.extract_features())

# get training labels
path = '../DataSet/train_data.json'
with open(path) as f:
    X_label = json.load(f)

# Get Test data (threads sit directly under the test folder, no topic level)
path = '../DataSet/test-data/'    # Path to read the test data
fold = _visible_entries(path)
test_data = [load_thread(os.path.join(path, thread)) for thread in fold]

# get testing features
Y_data = {}
for thread in test_data:
    Y_data.update(thread.extract_features())

# get testing labels
path = '../DataSet/test_label.json'
with open(path) as f:
    Y_label = json.load(f)


# Dumping Training dataset features and labels as a (features, labels) tuple.
# `with` guarantees the file is closed even if pickling fails.
with open("train.pkl", "wb") as f:
    pickle.dump((X_data, X_label), f)

# Dumping Testing dataset features and labels as a (features, labels) tuple.
with open("test.pkl", "wb") as f:
    pickle.dump((Y_data, Y_label), f)

