In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

## Looking into the data

In [None]:
# Load the tweets CSV into a DataFrame and preview the first rows.
# The preview is transposed so that each column is displayed as a row.
tweets_df = pd.read_csv("dataset/javascript_top.csv")
tweets_df.head().T

In [4]:
# Count the missing (NaN) values in each column.
tweets_df.isna().sum()

body          0
likes        17
link          0
replies     172
retweets     40
time          0
writer        0
dtype: int64

### We found that all missing values are in fact 0, so we replace them with 0

In [None]:
# Missing engagement counters mean "no interactions", so replace them with 0.
# The filled result is assigned back to the frame rather than calling
# `fillna(..., inplace=True)` on an attribute-accessed column: that form is
# chained assignment, which raises a FutureWarning on pandas 2.x and silently
# leaves `tweets_df` unchanged once Copy-on-Write is enabled (pandas 3).
# This version is also idempotent, so the cell can be re-run safely.
tweets_df = tweets_df.fillna({'replies': 0, 'retweets': 0, 'likes': 0})

### Now we will split the time feature into its parts (time of day, day, month and year) and rebuild it as a proper datetime

In [None]:
# Lower-case three-letter month abbreviation -> month number (1-12).
months = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'),
        start=1,
    )
}

def get_time(date_time):
    """Return the text before the first '-' of a raw timestamp string, stripped.

    For a string shaped like '<time> - <day> <month> <year>' this is the
    time-of-day part.
    """
    clock_part, *_ = date_time.split('-')
    return clock_part.strip()

def get_day(date_time):
    """Return the day of the month from a raw timestamp string as an int.

    The day is the first space-separated token of the segment that follows
    the first '-' (i.e. '<time> - <day> <month> <year>').
    """
    date_part = date_time.split('-')[1].strip()
    day_token = date_part.split(' ')[0]
    return int(day_token.strip())

def get_month(date_time):
    """Return the month number for a raw timestamp string.

    The month is the second space-separated token of the segment that follows
    the first '-'; it is lower-cased and looked up in the module-level
    `months` mapping, so an unknown abbreviation raises KeyError.
    """
    date_part = date_time.split('-')[1].strip()
    month_token = date_part.split(' ')[1].strip()
    return months[month_token.lower()]

def get_year(date_time):
    """Return the year from a raw timestamp string as an int.

    The year is the third space-separated token of the segment that follows
    the first '-' (i.e. '<time> - <day> <month> <year>').
    """
    date_part = date_time.split('-')[1].strip()
    year_token = date_part.split(' ')[2]
    return int(year_token.strip())

In [None]:
def get_time_column(df, column_name):
    """Parse a column of raw timestamp strings into pandas datetimes.

    Each value is broken into time, day, month number and year with the
    get_time/get_day/get_month/get_year helpers, re-assembled as
    '<time> - <day>/<month>/<year>' and parsed in one go with the
    12-hour-clock format '%I:%M %p - %d/%m/%Y'.

    Parameters
    ----------
    df : DataFrame holding the raw timestamp column.
    column_name : name of that column.

    Returns
    -------
    The result of `pd.to_datetime` over the re-assembled strings, in the
    same order as the column.
    """
    normalised = [
        f'{get_time(raw)} - {get_day(raw)}/{get_month(raw)}/{get_year(raw)}'
        for raw in df[column_name]
    ]
    return pd.to_datetime(normalised, format='%I:%M %p - %d/%m/%Y')
# Replace the raw timestamp strings with parsed datetimes, then derive the
# hour of day from them.
# NOTE: this is not re-runnable as-is. Once 'time' holds datetimes, the
# string splitting inside get_time_column no longer applies; reload
# tweets_df before running this cell again.
tweets_df['time'] = get_time_column(tweets_df, 'time')
tweets_df['hour'] = tweets_df['time'].dt.hour

## Content exploration per period

In [None]:
def get_top_content(df, periods, hours_per_period=4):
    """Split tweets into consecutive hour-of-day buckets.

    Bucket ``p`` holds the rows whose ``hour`` satisfies
    ``hour // hours_per_period == p``. With the default width of 4 hours,
    ``periods=6`` covers the whole day (hours 0-3, 4-7, ..., 20-23).

    Parameters
    ----------
    df : DataFrame with an integer ``hour`` column.
    periods : number of buckets to return (buckets 0 .. periods-1).
    hours_per_period : width of each bucket in hours (default 4).

    Returns
    -------
    list of DataFrame, one per bucket, in bucket order. Each frame carries
    the computed 'time-period' column alongside the original columns.

    Raises
    ------
    ValueError
        If ``hours_per_period`` is not positive.

    Notes
    -----
    The caller's ``df`` is never modified. The bucket label is computed on a
    derived frame, so an existing 'time-period' column in ``df`` is not
    overwritten or dropped, and no scratch column is left behind if an
    error occurs part-way.
    """
    if hours_per_period <= 0:
        raise ValueError('hours_per_period must be a positive number of hours')

    # assign() returns a new frame; `df` itself stays untouched.
    bucketed = df.assign(**{'time-period': df['hour'] // hours_per_period})
    return [bucketed[bucketed['time-period'] == period] for period in range(periods)]

In [None]:
# Split the tweets into six hour-of-day buckets.
x = get_top_content(tweets_df, 6)

In [None]:
# Sanity check: list the distinct hours that ended up in the first bucket.
x[0].hour.unique()

In [None]:
# NOTE(review): get_tweets_between is not defined in any cell shown in this
# part of the notebook. Confirm it is defined elsewhere; otherwise a fresh
# "Restart & Run All" stops here with a NameError.
# The eight arguments presumably describe a start and an end as
# (hour, day, month, year) - TODO confirm against the function's definition.
tweets=get_tweets_between(1,15,1,2019,8,15,1,2019)

In [None]:
# Look at the text of the first tweet in the selection.
tweets.body.iloc[0]

In [None]:
def get_tweets_top_tokens(tweets, topN=10):
    """Return the highest-scoring TF-IDF bigrams found across a set of tweets.

    A TfidfVectorizer (lower-casing, English stop words removed, bigrams
    only) is fitted on ``tweets.body``. For every tweet its ``topN``
    best-scoring bigrams are collected; the pooled candidates are then sorted
    by score, highest first, and the first ``topN`` are returned. The same
    bigram can appear more than once when it scores highly in several tweets.

    Parameters
    ----------
    tweets : object with a ``body`` attribute holding the tweet texts
        (e.g. a DataFrame with a ``body`` column).
    topN : maximum number of entries to return (default 10).

    Returns
    -------
    list of dict
        ``{"token": <bigram>, "score": <tf-idf rounded to 5 decimals>}``
        entries, sorted by descending score. Only bigrams that actually
        occur in a tweet are reported, so the list can be shorter than
        ``topN``.
    """
    tweetsVectorizer = TfidfVectorizer(use_idf=True, min_df=1, lowercase=True,
                                       stop_words='english', ngram_range=(2, 2))
    # Keep the matrix sparse: a dense tweets x vocabulary array grows very
    # quickly, and only the non-zero entries of each row are needed.
    tfidf = tweetsVectorizer.fit_transform(tweets.body).tocsr()

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); support both. The lookup is done once here
    # instead of once per token inside the loops.
    if hasattr(tweetsVectorizer, 'get_feature_names_out'):
        featureNames = tweetsVectorizer.get_feature_names_out()
    else:
        featureNames = tweetsVectorizer.get_feature_names()

    topTokens = []
    for row in range(tfidf.shape[0]):
        # CSR layout: the non-zero entries of `row` live in this slice.
        start, stop = tfidf.indptr[row], tfidf.indptr[row + 1]
        columns = tfidf.indices[start:stop]
        scores = tfidf.data[start:stop]
        # argsort is ascending, so the last topN positions are the best ones.
        for position in scores.argsort(kind='stable')[-topN:]:
            topTokens.append({"token": str(featureNames[columns[position]]),
                              "score": round(float(scores[position]), 5)})

    topTokens.sort(key=lambda entry: entry['score'], reverse=True)
    return topTokens[:topN]

In [None]:
# Show the top TF-IDF tokens across the selected tweets (default topN=10).
get_tweets_top_tokens(tweets)