### Imports and setup

In [None]:
# imports
import json
import pickle
import pandas as pd

# setup for logging
import logging
import os
from datetime import datetime

# write logs with time to log folder
# open() does not expand "~", so the home directory is resolved explicitly;
# otherwise the handler would target a directory literally named "~" under
# the current working directory
LOG_DIR = os.path.expanduser('~/log')

# the file handler does not create missing parent folders
os.makedirs(LOG_DIR, exist_ok=True)

LOG_FILENAME = os.path.join(
    LOG_DIR, datetime.now().strftime('logfile_%H_%M_%S_%d_%m_%Y.log')
)

# remove handlers that are already attached to the root logger so that
# basicConfig below installs the file handler
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)

# get data
# binary mode lets json.load detect the encoding (UTF-8/16/32) itself instead
# of decoding with the platform default, which can corrupt non-ASCII text
with open('data.json', 'rb') as f:
    datastore = json.load(f)

# make dataframe
df = pd.DataFrame(datastore)

logging.info('Dataframe created')

### Basic cleaning

In [None]:
# how many rows repeat an id that already appeared
count = df['id'].duplicated().sum()
print("Number of duplicates before removal:", count)

# order by id, then keep only the first row of every id
df_sample_sorted = df.sort_values(by='id')
df_sample_unique = df_sample_sorted.drop_duplicates(subset=['id'])

# recount to confirm the removal
count_expost = df_sample_unique['id'].duplicated().sum()
print("Number of duplicates after removal:", count_expost)

logging.info("Dups removed")

# replace string timestamps with datetime
# assign() returns a new frame rather than writing into the drop_duplicates
# result, which pandas without copy-on-write reports as SettingWithCopyWarning
df_sample_unique = df_sample_unique.assign(
    timestamp=df_sample_unique['timestamp'].astype('datetime64[ns]')
)

# sort by timestamp values, then reset index and drop the old one to avoid
# making a new column with the old index
df_sample_bytime = (
    df_sample_unique
    .sort_values('timestamp', ascending=True)
    .reset_index(drop=True)
)

# add column with index values to use as id (the fresh index becomes a
# regular column named 'index')
df_sample_bytime_andindex = df_sample_bytime.reset_index(drop=False)

# drop columns _id, id
df_sample_final = df_sample_bytime_andindex.drop(columns=['_id', 'id'])

logging.info("Basic cleaning completed")

### Tweet cleaner

In [None]:
# import 
import preprocessor as p

# run the tweet preprocessor over every text (removes hashtags, mentions,
# emoji and URLs), then write the cleaned values back
cleaned_text = df_sample_final['text'].apply(p.clean)
df_sample_final['text'] = cleaned_text

logging.info("Preprocessor worked")

### Language filter

In [None]:
# imports
from langdetect import detect

# define function
def try_detect(cell):
    """Return the language code that ``detect`` reports for *cell*, or None.

    Detection is best-effort: if ``detect`` raises an ordinary exception the
    row is labelled ``None`` instead of aborting the whole ``apply``.
    """
    try:
        return detect(cell)
    # Exception instead of a bare except, so KeyboardInterrupt and SystemExit
    # still propagate and a long run can be interrupted
    except Exception:
        return None

# label every tweet with its detected language
df_sample_final['lang'] = df_sample_final['text'].apply(try_detect)

logging.info("Langdetect complete")

# keep only Turkish; rows whose language could not be detected (None) are
# not in the target list either, so they are dropped as well
target = ['tr']
not_target = ~df_sample_final['lang'].isin(target)
df_sample_final.drop(df_sample_final[not_target].index, inplace=True)

logging.info("Other languages removed")

### Ensure separation of words

In [None]:
# remover function
def remover(stringput):
    """Return *stringput* with punctuation and joining characters replaced by spaces.

    Each of the listed characters becomes exactly one space; every other
    character is copied unchanged.
    """
    separators = frozenset("'-,/_&*:+.’!?();")
    return "".join(" " if char in separators else char for char in stringput)

# replace apostrophes and joining characters with spaces so that words stay separated
separated_text = df_sample_final['text'].apply(remover)
df_sample_final['text'] = separated_text

### Named Entity Recognition

In [None]:
# import NER toolkit
import polyglot
from polyglot.text import Text, Word
import subprocess

# fetch the polyglot embeddings2.tr and ner2.tr packages; an argument list
# runs the command directly instead of routing a string through the shell
subprocess.run(['polyglot', 'download', 'embeddings2.tr', 'ner2.tr'])

%%bash
# download the polyglot embeddings2.tr and ner2.tr packages
polyglot download embeddings2.tr ner2.tr

# define extractor function
def ner(cell):
    """Return the ``entities`` that polyglot's ``Text`` finds in *cell*.

    The text is wrapped with the language hint ``'tr'``.
    """
    return Text(cell, hint_language_code='tr').entities

# run the extractor over every tweet and keep its output in labeled_ner
ner_output = df_sample_final['text'].apply(ner)
df_sample_final['labeled_ner'] = ner_output

logging.info("NER success")

### Stopwords

In [None]:
# import stopwords from .txt file
# NOTE(review): the file is decoded with the platform default encoding -
# confirm that this matches how STOPFILE.txt was saved
with open('STOPFILE.txt', 'r') as f:
    # only the first comma-separated field of each line is used
    stopwords = [line.strip().split(',')[0] for line in f]

# remove dups in list to ensure clean output (first occurrence is kept,
# order is preserved)
stopwords = list(dict.fromkeys(stopwords))

# define function
def stopcleaner(cell):
    """Lower-case *cell*, split it on whitespace and blank out stop words.

    Tokens found in the module-level ``stopwords`` list are replaced by an
    empty string before the tokens are re-joined with single spaces, so a
    removed word leaves two adjacent spaces (or a leading/trailing space).
    """
    tokens = cell.lower().split()
    return " ".join("" if token in stopwords else token for token in tokens)

# tokenize the text feature and remove stop word noise
text_without_stopwords = df_sample_final['text'].apply(stopcleaner)
df_sample_final['text'] = text_without_stopwords

logging.info("Stopwords are out")

### Location mentions

In [None]:
# initialize list
places = []

# all gazetteer files are read the same way: one entry per line, and only
# the first comma-separated field of the line is used as the place name
PLACE_FILES = [
    'Neighborhoods.txt',
    'POI.txt',
    'Bridges.txt',
    'Stations.txt',
    'Mosques.txt',
]

# NOTE(review): files are decoded with the platform default encoding -
# confirm that this matches how the .txt files were saved
for filename in PLACE_FILES:
    # the with-statement closes the file even if reading fails part-way
    with open(filename, 'r') as infile:
        for line in infile:
            name = line.strip().split(',')[0]
            # blank lines would yield an empty name, which is a substring of
            # every tweet and would be reported as a location for every row
            if name:
                places.append(name)

# remove dups in list to ensure clean output (first occurrence is kept,
# order is preserved)
places = list(dict.fromkeys(places))

# getlocs function
def getlocs(cell):
    """Return the lower-cased names from ``places`` that occur in *cell*.

    A place counts as mentioned when its name, either as written or
    lower-cased, is a substring of *cell*. Results follow the order of the
    module-level ``places`` list.
    """
    out = []
    for item in places:
        name = str(item)
        if not name:
            # an empty name is a substring of every string and would be
            # reported for every text
            continue
        lowered = name.lower()
        if name in cell or lowered in cell:
            out.append(lowered)
    return out

# collect the location mentions of every tweet
location_mentions = df_sample_final['text'].apply(getlocs)
df_sample_final['ist_locations'] = location_mentions

logging.info("Location mentions done")

### Extracting and lemmatizing informative words

In [None]:
#imports
from cube.api import Cube

# create the Cube pipeline object with verbose output enabled
cube = Cube(verbose=True)

# load the "tr" model into it
cube.load("tr")

def getall(cell):
    """Return the lemmas of the verbs, nouns, adjectives and pronouns in *cell*.

    *cell* is passed to the module-level ``cube`` object; every entry of every
    returned sentence whose ``upos`` is one of the wanted tags contributes
    ``str(entry.lemma)``, in document order.
    """
    wanted_tags = ("VERB", "NOUN", "ADJ", "PRON")
    return [
        str(token.lemma)
        for sentence in cube(cell)
        for token in sentence
        if token.upos in wanted_tags
    ]

# import
from multiprocess import Pool
import time

# define series
df_tho = df_sample_final["text"]

# start timer
t0 = time.time()

# execute the computation in parallel on 96 worker processes; the context
# manager shuts the workers down when the block is left, including when
# map raises, so no worker processes are left behind
with Pool(96) as pool:
    result = pool.map(getall, df_tho)

# record the time
t1 = time.time()
print("Time: {}".format(t1-t0))

# append result to dataframe (map returns results in input order)
df_sample_final["content"] = result

logging.info("Informative words extracted")

### Transliteration 

In [None]:
# transliteration function

# Turkish-specific letters and the ASCII lower-case letter that replaces them.
# Upper- and lower-case forms map to the same ASCII letter, so one pass is
# enough regardless of the input's casing.
_TRANSLIT_MAP = {
    "ı": "i", "ğ": "g", "ş": "s", "ç": "c", "ü": "u", "ö": "o",
    "I": "i", "Ğ": "g", "Ş": "s", "Ç": "c", "Ü": "u", "Ö": "o",
    "İ": "i",
    # str.lower() turns "İ" into "i" followed by U+0307 (combining dot
    # above); drop the mark so already lower-cased text normalizes too
    "\u0307": "",
}


def transliterate(item):
    """Return *item* with Turkish-specific letters replaced by ASCII letters.

    Characters without an entry in the mapping are copied unchanged; in
    particular, ASCII upper-case letters other than ``I`` are not lower-cased.
    """
    return "".join(_TRANSLIT_MAP.get(char, char) for char in item)

# function that applies transliterate to a list
def transliminator(listput):
    """Return a new list holding ``transliterate(item)`` for every item of *listput*."""
    return [transliterate(item) for item in listput]

# transliterate every word list in the content column
normalized_content = df_sample_final['content'].apply(transliminator)
df_sample_final['content_norm'] = normalized_content

logging.info("Tweets transliminated")

### Combine outputs and normalize

In [None]:
# get list from list of list
def flattener(l):
    """Return one flat list holding the items of every sub-iterable of *l*, in order."""
    merged = []
    for sublist in l:
        merged.extend(sublist)
    return merged

# flatten the NER output into one word list per tweet and transliterate it
flat_entities = df_sample_final["labeled_ner"].apply(flattener)
df_sample_final["ner_flat"] = flat_entities
df_sample_final["ner_flat"] = df_sample_final["ner_flat"].apply(transliminator)

# concatenate the three word lists of every row: entities, locations, lemmas
df_sample_final["combined"] = (
    df_sample_final["ner_flat"]
    + df_sample_final["ist_locations"]
    + df_sample_final["content_norm"]
)

# make final one .lower() and ensure each word only appears once

# function that makes each item in list .lower()
def lowerit(listput):
    """Return a new list with the lower-cased form of every item of *listput*."""
    return [str(item.lower()) for item in listput]

# remove dups in list to ensure clean output
def removedups(stuff):
    """Return the items of *stuff* without repeats, keeping first-seen order."""
    seen = set()
    unique = []
    for element in stuff:
        if element not in seen:
            seen.add(element)
            unique.append(element)
    return unique

# lower-case, transliterate and de-duplicate every combined word list, in that order
df_sample_final["combined"] = (
    df_sample_final["combined"]
    .apply(lowerit)
    .apply(transliminator)
    .apply(removedups)
)

# file export
# the with-statement closes the file when the block is left, so no
# separate close() call is needed afterwards
with open('df_preprocessed.pkl', 'wb') as f:
    pickle.dump(df_sample_final, f)

logging.info("File is out")