In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import logging
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

# Shared VADER sentiment analyzer; process_replies() uses it to score reply text.
sid = SentimentIntensityAnalyzer()


# Each split has a directory prefix for its per-tweet `<id>.json` files and a
# list file naming the tweets to load. The directory prefixes are concatenated
# directly with file names below, so they must keep their trailing slash.

# Training split (not referenced in the visible cells; presumably the same
# layout as the dev split -- TODO confirm).
train_path = '../Train_Data/'
train_data_list = '../project-data/train.data.txt'

# Development split: passed to process_data_file() below.
dev_path = '../Dev_Data/'
dev_data_list = '../project-data/dev.data.txt'

# Test split: test_path is scanned for `.json` files in a later cell.
test_path = '../Test_Data/'
test_data_list = '../project-data/test.data.txt'

def extract_tweet(tweet_id, dir_path):
  """Load a target tweet's JSON file and flatten the fields used downstream.

  Args:
    tweet_id: Identifier used to build the file name ``{tweet_id}.json``.
    dir_path: Directory prefix. It is concatenated directly in front of the
      file name, so it must already end with a path separator.

  Returns:
    A flat dict of tweet and user fields, or None when the file does not
    exist (the miss is logged at ERROR level).

  Raises:
    KeyError: if the JSON object lacks one of the fields read below.
    json.JSONDecodeError: if the file is not valid JSON.
  """
  try:
    # JSON is UTF-8 by specification. Pin the encoding so emoji and accented
    # text decode the same way on every platform instead of depending on the
    # locale's default encoding.
    with open(f'{dir_path}{tweet_id}.json', encoding='utf-8') as json_file:
      data = json.load(json_file)
  except FileNotFoundError:
    logging.error(f"Target {tweet_id} not found")
    return None

  user = data['user']
  return {'tweet_id': data['id'],
          'user_id': user['id'],
          'followers_count': user['followers_count'],
          'friends_count': user['friends_count'],
          'listed_count': user['listed_count'],
          'favourites_count': user['favourites_count'],
          'verified': user['verified'],
          # Key spelling ('hastags') is kept as-is: it becomes a DataFrame
          # column name that other code may already rely on.
          'hastags': data['entities']['hashtags'],
          'text': data['text'],
          'created_at': data['created_at'],
          'in_reply_to_status_id': data['in_reply_to_status_id'],
          'retweet_count': data['retweet_count'],
          'favorite_count': data['favorite_count']}
            
    
def extract_reply(tweet_id, dir_path):
  """Load a reply tweet's JSON file and keep only its text and timestamp.

  Args:
    tweet_id: Identifier used to build the file name ``{tweet_id}.json``.
    dir_path: Directory prefix. It is concatenated directly in front of the
      file name, so it must already end with a path separator.

  Returns:
    ``{'text': ..., 'created_at': ...}``, or None when the file does not
    exist. A missing reply is logged at INFO level only.

  Raises:
    KeyError: if the JSON object lacks ``text`` or ``created_at``.
    json.JSONDecodeError: if the file is not valid JSON.
  """
  try:
    # JSON is UTF-8 by specification. Pin the encoding so reply text decodes
    # identically regardless of the platform's default locale encoding.
    with open(f'{dir_path}{tweet_id}.json', encoding='utf-8') as json_file:
      data = json.load(json_file)
  except FileNotFoundError:
    logging.info(f"reply {tweet_id} not found")
    return None

  return {'text': data['text'],
          'created_at': data['created_at']}

  

def process_tweet_text(text):
  """Lower-case tweet text and strip URLs, bare ``.com`` domains and @mentions.

  Args:
    text: Raw tweet text.

  Returns:
    The lower-cased text with the matched spans removed. Surrounding
    whitespace is left untouched, so removals can leave double spaces.
  """
  text = text.lower()
  # Scheme-qualified links, up to the next whitespace.
  text = re.sub(r'https?:\/\/\S+', '', text)
  # Bare domains such as "www.google.com" or "example.com". The optional
  # "www." prefix and the domain are matched as one unit so no "www." residue
  # is left behind, and \b stops "foo.community" from losing "foo.com".
  text = re.sub(r'(?:www\.)?(?:[a-z0-9-]+\.)+com\b', '', text)
  # User mentions.
  text = re.sub(r'@\w+', '', text)
  return text


"""
process list of replies into single dict
"""
def process_replies(replies):
  """Collapse a list of reply dicts into one dict of aggregate features.

  Args:
    replies: List of dicts, each with ``text`` and ``created_at`` keys.

  Returns:
    A dict with ``reply_avg_sent`` (mean VADER compound score of the raw
    reply texts), ``reply_sent_trend`` (currently always 0) and
    ``reply_text`` (cleaned reply texts joined with single spaces, in
    ``created_at`` order). With no replies, the score is 0 and the text is
    empty.
  """
  if len(replies) == 0:
    return {'reply_avg_sent' : 0,
            'reply_sent_trend': 0,
            'reply_text': ''}

  # NOTE(review): `created_at` is sorted as-is; if it holds date strings this
  # is a lexical order, not necessarily a chronological one -- confirm.
  ordered = pd.DataFrame(replies).sort_values('created_at')
  cleaned = ordered['text'].apply(process_tweet_text)
  # Sentiment is scored on the raw text, not the cleaned text.
  compound = ordered['text'].apply(lambda raw: sid.polarity_scores(raw)['compound'])

  features = {}
  features['reply_avg_sent'] = compound.mean()
  features['reply_sent_trend'] = 0  # placeholder: is the reply sentiment up or down
  features['reply_text'] = ' '.join(cleaned.tolist())
  return features



def process_data_file(list_file_path, tweet_path):
  """Build one feature row per target tweet listed in ``list_file_path``.

  Each line of the list file is a comma-separated sequence of tweet ids: the
  first id is the target tweet and any remaining ids are its replies. Lines
  whose target tweet file is missing are skipped. Missing reply files are
  ignored.

  Args:
    list_file_path: Path of the list file described above.
    tweet_path: Directory prefix (ending with a path separator) that holds
      the ``<id>.json`` files. ``process.log`` is also written there.

  Returns:
    A pandas DataFrame with one row per target tweet found, combining the
    fields from extract_tweet() with the aggregates from process_replies().
  """
  # basicConfig only takes effect if the root logger has no handlers yet, so
  # a second call with a different tweet_path keeps the first log file.
  logging.basicConfig(filename=f'{tweet_path}process.log', level=logging.INFO)

  target_tweets = []
  with open(list_file_path, "r") as f:
    for line in f:
      if not line.strip():
        # Blank line (e.g. a trailing newline): nothing to load.
        continue
      tweet_ids = [tweet_id.strip() for tweet_id in line.split(',')]

      tweet_dict = extract_tweet(tweet_ids[0], tweet_path)
      if tweet_dict is None:
        logging.error('warning -- Missing a target tweet')
        continue

      # Collect replies once per line. Resetting this list for every tweet id
      # (as the previous version did) discarded all but the last reply.
      reply_tweets = []
      for reply_id in tweet_ids[1:]:
        reply = extract_reply(reply_id, tweet_path)
        if reply is not None:
          reply_tweets.append(reply)

      target_tweets.append({**tweet_dict, **process_replies(reply_tweets)})
  return pd.DataFrame(target_tweets)

# Build the feature frame for the development split; later cells add a
# `clean_text` column to `a` and export it.
a = process_data_file(dev_data_list, dev_path)
        

In [5]:
# Add a cleaned copy of each target tweet's text next to the raw text.
a['clean_text'] = a['text'].map(process_tweet_text)

In [6]:
# Copy the cleaned and raw text columns to the system clipboard for a
# side-by-side check.
a.loc[:, ['clean_text', 'text']].to_clipboard()

In [7]:
import os, json
import pandas as pd

# Collect the `truncated` flag of every tweet JSON file in the test directory.
# sorted() pins the order, which os.listdir leaves unspecified.
json_files = sorted(name for name in os.listdir(test_path) if name.endswith('.json'))
truncates = []
for json_name in json_files:
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # locale encoding when reading tweet text.
    with open(f'{test_path}{json_name}', encoding='utf-8') as json_file:
        data = json.load(json_file)
    truncates.append(data['truncated'])

