In [1]:
import numpy as np
import pandas as pd
import itertools
import json
import os
import re
import time
import pickle
from collections import Counter

# Select tweets | Keyword streaming 
## **SNLP team project**

In this notebook, the raw dataset is preprocessed with simple filters to ensure data quality.

**The notebook implements the following functionality:**
* Drop redundant tweet types 
* Drop corrupted tweets
* Filter based on language
* Write tweets as JSON files

In [2]:
def detect_language(data, LANG):
    """Drop, in place, every row of ``data`` whose text is not recognised as ``LANG``.

    Each value of ``data['text']`` is passed to ``is_language``; rows for which
    it returns False are removed from ``data`` itself. Nothing is returned.
    """
    drop_mask = []

    for tweet_text in data['text']:
        # The partially built mask is forwarded only because ``is_language``
        # takes it as its third argument.
        matches = is_language(LANG, tweet_text, drop_mask)
        drop_mask.append(not matches)

    rows_to_drop = data[drop_mask].index
    data.drop(rows_to_drop, inplace=True)

def is_language(lang, text, bool_arr):
    """Report whether ``langdetect`` classifies ``text`` as language ``lang``.

    Parameters
    ----------
    lang : str
        Language code compared against the value returned by
        ``langdetect.detect``.
    text : str
        Text to classify.
    bool_arr : list
        Not used by this function; kept so existing call sites keep working.

    Returns
    -------
    bool
        True when the detected language equals ``lang``; False when it differs
        or when detection raises for this text.

    Raises
    ------
    NameError
        If the name ``langdetect`` is not defined, i.e. the module has not
        been imported before this function is called.
    """
    try:
        return langdetect.detect(text) == lang
    except NameError:
        # A missing ``import langdetect`` is a setup error, not a detection
        # failure. Swallowing it would silently report every text as
        # non-matching, so let it surface.
        raise
    except Exception:
        # Best effort: a text the detector cannot handle counts as "not lang".
        return False

def drop_non_statuses(data):
    """Drop, in place, rows that hold a streaming notice rather than a status.

    For every notice key that exists as a column of ``data``, rows with a
    non-null value in that column are removed from ``data``. Columns that are
    absent are skipped. Nothing is returned.
    """
    notice_keys = (
        'delete', 'event', 'direct_message', 'friends', 'limit',
        'disconnect', 'warning', 'scrub_geo', 'status_withheld', 'user_withheld',
    )

    # Dropping rows never changes the column set, so the lookup is done once.
    present_keys = [key for key in notice_keys if key in data.columns]

    for key in present_keys:
        is_notice = data[key].notna()
        data.drop(data[is_notice].index, inplace=True)

def select_tweets(data, lang='en', detect_lang=False):
    """Filter a chunk of streamed messages down to tweets in one language, in place.

    Steps:
      1. Streaming notices are removed with ``drop_non_statuses``.
      2. Rows whose ``lang`` column equals ``lang`` are kept. With
         ``detect_lang=True`` the remaining rows are passed to
         ``detect_language`` and the rows it retains are kept as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Chunk to filter. It is modified in place; it must have a ``lang``
        column, and a ``text`` column when ``detect_lang`` is True.
    lang : str, optional
        Language code to keep. Defaults to ``'en'``.
    detect_lang : bool, optional
        Whether rows not tagged as ``lang`` get a second check through
        ``detect_language``. Defaults to False.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``data`` has no ``lang`` column.
    """
    # Measured here so the report below does not depend on a notebook global.
    batch_size = data.shape[0]

    # 1. Drop queried msg's other than statuses
    drop_non_statuses(data)

    # 2. Select tweets based on the language tag, optionally backed by detection
    tagged = data['lang'] == lang
    untagged_index = data[~tagged].index

    if not detect_lang:
        data.drop(untagged_index, inplace=True)
        return

    # detect_language drops rows in place, so it is run on a copy of the
    # untagged rows; afterwards the copy holds only the rows it accepted.
    candidates = data[~tagged].copy()
    n_candidates = candidates.shape[0]
    detect_language(candidates, lang)
    n_recovered = candidates.shape[0]

    # Remove from the caller's frame only the untagged rows that detection
    # rejected too, so the recovered tweets stay in ``data`` itself.
    rejected = untagged_index.difference(candidates.index)
    data.drop(rejected, inplace=True)

    share = 100 * n_recovered / n_candidates if n_candidates else 0.0
    print('\nLang detection completed:')
    print('\t* {} additional tweets identified in batch of {}'.format(n_recovered, batch_size))
    print('\t* {:.2f} % of the non-{} set\n'.format(share, lang))


### Loading the data

In [17]:
# Number of rows read from the raw file per chunk.
csize = 200000

# Input file with the raw streamed messages.
rawpath = '../../tweet_data/raw/tweets_climate_en_20200812.txt'
# Output path template; the '{}' placeholder is filled in before writing.
savepath = '../../tweet_data/filtered/tweets_climate_en_{}.json'

# Filtered rows waiting to be written; None means nothing is pending.
data_file = None

In [18]:
# Running statistics, updated once per processed chunk.
n_extracted_ls = []   # rows kept in each chunk
iter_times = []       # seconds spent on each chunk
global_count = 0      # rows kept over all chunks

In [None]:
# Files to preprocess. The remaining-time estimate below is derived from the
# length of this list, so add further raw files here.
filepaths = [rawpath]
times = []

for i,fpath in enumerate(filepaths):
    print('Preprocessing file: {}'.format(fpath))
    stime = time.time()
    # The first run of digits in the path becomes the date tag of the output file.
    date = re.findall('[0-9]+', fpath)[0]
    data_iter = pd.read_json(fpath,  orient = "records", 
                        dtype = False, lines = True, 
                        encoding = "utf-8", chunksize = csize)

    data = iter(data_iter)
    n_kept_file = 0
    
    j = 0
    while True:
        s_time_iter = time.time()
        try:
            df_chunk = next(data)
        
            # Preprocess the dataframe (filters df_chunk in place)
            select_tweets(df_chunk)
            
            # data_file is only non-None here when an earlier chunk failed
            # after being filtered but before being written.
            if data_file is None:
                data_file = df_chunk.copy()
            else:
                data_file = pd.concat([data_file, df_chunk])
        
            n_extracted = df_chunk.shape[0]
            n_extracted_ls.append(n_extracted)
            global_count += n_extracted
            n_kept_file += n_extracted
            
            #Print iteration stats
            iter_time = time.time()-s_time_iter
            iter_times.append(iter_time)
            
            print('* Number of tweets processed: {} per file'.format((j+1)*csize))
            print('* Relevant tweets identified: \n\t\t\t- {} iteration / {} overall'.format(n_extracted, global_count)) 
            print('\t\t\t- On avg: {:.2f} iteration (+/-) {:.0f}'.format(np.mean(n_extracted_ls), np.std(n_extracted_ls)))
            print('\t\t\t-         {:.2f} %         (+/-) {:.2f} %'.format(100*np.mean(n_extracted_ls) / csize, 100*np.std(n_extracted_ls) / csize))
            print('* Time for iteration {}: {:.0f} s'.format(j+1,iter_time))
            print('* Avg time for iteration: {:.1f} s'.format(np.mean(iter_times)))
            print(50*'==','\n')
            
        except StopIteration:
            break
        except Exception as e:
            # Report the failing chunk and move on to the next one.
            print('In file: {} at {}th iteration, exception occurred: {}'.format(fpath,j,e))
            j += 1
            continue
        
        j += 1
    
        # Append mode: every chunk is added to the same output file, so
        # re-running this cell appends the same rows again.
        with open(savepath.format(date), 'a') as file:
            data_file.to_json(file, orient = "records", lines = True)
        
        data_file = None
    
    offset = time.time() - stime
    times.append(offset)
    m_time = np.mean(times)
    
    # Remaining files times the mean duration of the files processed so far.
    k = len(filepaths) - (i+1)
    t_left = m_time * k 
    h_left = t_left // 3600
    m_left = (t_left % 3600) // 60
    s_left = (t_left % 60) 
    print(50*'=')
    print('Time for iteration: {0} minutes, {1:.2f} seconds'.format(offset // 60, offset % 60 ))
    print('Estimated time left: {0} hours, {1} minutes, {2:.2f} seconds'.format(h_left, m_left, s_left))
    print('Ready with file: {}'.format(os.path.basename(fpath)))
    print('\t* # of tweets kept: {}\n'.format(n_kept_file))

Preprocessing file: ../../tweet_data/raw/tweets_climate_en_20200812.txt
