In [1]:
# Reload the autoreload extension and use mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

## Looking into the data

In [3]:
# Load the tweets CSV into a DataFrame.
tweets_df = pd.read_csv("dataset/javascript_top.csv")
# Show the first rows transposed, so each tweet is a column and each field a row.
tweets_df.head().T

Unnamed: 0,0,1,2,3,4
body,"""Return [data, loading, error];}"" http://socia...","Late tweet, 5th 6th (Days 53, 54). Very little...",Vue.js 2 Essentials: Build Your First Vue App☞...,When u write 500 lines of codes and exit witho...,My #first chat app hope to collaborate #javasc...
likes,0,0,0,2,
link,/eibrahim/status/1093513673618964480,/L1K3R0535/status/1093513800098242565,/javascript_devv/status/1093514298486341632,/BarineSambaris/status/1092896518573568008,/GoodwishSifiso/status/1093224111919251456
replies,0,0,0,3,
retweets,,3,0,0,0
time,6:15 AM - 7 Feb 2019,6:16 AM - 7 Feb 2019,6:18 AM - 7 Feb 2019,1:23 PM - 5 Feb 2019,11:05 AM - 6 Feb 2019
writer,@eibrahim,@L1K3R0535,@javascript_devv,@BarineSambaris,@GoodwishSifiso


### We found that all missing values actually represent 0, so we replace them with 0

In [4]:
# Missing engagement counts mean "zero", so fill them with 0.
# The result is assigned back rather than calling `fillna(..., inplace=True)` on
# `tweets_df.<column>`: that form operates on an intermediate Series (chained
# assignment) and, with pandas Copy-on-Write, leaves `tweets_df` unchanged.
tweets_df = tweets_df.fillna({'replies': 0, 'retweets': 0, 'likes': 0})

### Now we parse the time feature into its clock time, day, month, and year parts and convert it to a datetime

In [5]:
# Lower-case three-letter month abbreviations mapped to their month number (1-12).
months = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec'),
        start=1,
    )
}

def get_time(date_time):
    """Return the text before the first '-' of ``date_time``, stripped of whitespace.

    For a string such as ``'6:15 AM - 7 Feb 2019'`` this is the clock part,
    ``'6:15 AM'``.
    """
    clock_part, *_ = date_time.split('-')
    return clock_part.strip()

def get_day(date_time):
    """Return the day of month as an int.

    The day is the first space-separated field of the text that follows the
    first '-' in ``date_time`` (e.g. ``7`` for ``'6:15 AM - 7 Feb 2019'``).
    """
    date_part = date_time.split('-')[1].strip()
    day_text = date_part.split(' ')[0]
    return int(day_text.strip())

def get_month(date_time):
    """Return the month number looked up in the module-level ``months`` table.

    The month abbreviation is the second space-separated field of the text that
    follows the first '-' in ``date_time``; it is lower-cased before the lookup,
    so ``'Feb'`` and ``'feb'`` both resolve to ``2``.
    """
    date_part = date_time.split('-')[1].strip()
    month_abbreviation = date_part.split(' ')[1].strip().lower()
    return months[month_abbreviation]

def get_year(date_time):
    """Return the year as an int.

    The year is the third space-separated field of the text that follows the
    first '-' in ``date_time`` (e.g. ``2019`` for ``'6:15 AM - 7 Feb 2019'``).
    """
    date_part = date_time.split('-')[1].strip()
    year_text = date_part.split(' ')[2]
    return int(year_text.strip())

In [6]:
def get_time_column(df, column_name):
    """Parse a column of raw time strings into datetimes.

    Each value is expected to look like ``'6:15 AM - 7 Feb 2019'``. It is split
    with ``get_time`` / ``get_day`` / ``get_month`` / ``get_year``, re-assembled
    as ``'<time> - <day>/<month>/<year>'`` and parsed with a fixed format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to parse.
    column_name : str
        Name of the column with the raw time strings.

    Returns
    -------
    pandas.DatetimeIndex
        One parsed timestamp per row, in row order. If the column already has a
        datetime dtype, its values are returned unchanged.
    """
    column = df[column_name]

    # Keep the cell idempotent: once the column has been converted, running the
    # cell again would otherwise fail because Timestamp objects have no .split().
    if pd.api.types.is_datetime64_any_dtype(column):
        return pd.DatetimeIndex(column)

    date_list = [
        f'{get_time(item)} - {get_day(item)}/{get_month(item)}/{get_year(item)}'
        for item in column
    ]
    return pd.to_datetime(date_list, format='%I:%M %p - %d/%m/%Y')
# Replace the raw time strings with the parsed datetimes.
tweets_df['time'] = get_time_column(tweets_df, 'time')

## Content exploration per period

In [7]:
def get_tweets_between(start_hour, start_day, start_month, start_year,
                            end_hour, end_day, end_month, end_year):
    """Return the rows of the global ``tweets_df`` inside a time window.

    The window is half-open: a row is kept when its ``time`` is at or after the
    start (``start_year``/``start_month``/``start_day`` at ``start_hour``:00) and
    strictly before the end (``end_year``/``end_month``/``end_day`` at
    ``end_hour``:00).
    """
    window_start = datetime.datetime(
        year=start_year, month=start_month, day=start_day, hour=start_hour)
    window_end = datetime.datetime(
        year=end_year, month=end_month, day=end_day, hour=end_hour)

    tweet_times = tweets_df.time
    in_window = (tweet_times >= window_start) & (tweet_times < window_end)
    return tweets_df[in_window]

In [8]:
# Tweets posted on 15 Jan 2019 from 01:00 (inclusive) to 08:00 (exclusive).
tweets = get_tweets_between(
    start_hour=1, start_day=15, start_month=1, start_year=2019,
    end_hour=8, end_day=15, end_month=1, end_year=2019,
)

In [9]:
# Show the text of the first tweet in the selected window.
tweets.body.iloc[0]

'Never give up. Go over,Go under, go around,Grow through and Never give up  #100daysofcode #javascript #womenwhocode #vuejs #reactjs #angularjs'

In [10]:
def get_tweets_top_tokens(tweets,topN=10):
    """Return the highest-scoring TF-IDF bigrams found in ``tweets.body``.

    A ``TfidfVectorizer`` restricted to bigrams (``ngram_range=(2, 2)``), with
    lower-casing and English stop-word removal, is fitted on the tweet bodies.
    For every tweet its ``topN`` best-scoring bigrams are collected; the pooled
    candidates are then sorted by score (descending) and cut to ``topN``.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a ``body`` column holding the tweet texts.
    topN : int, default 10
        Number of candidates kept per tweet and number of entries returned.

    Returns
    -------
    list of dict
        Up to ``topN`` dicts ``{"token": <bigram>, "score": <tf-idf rounded to
        5 decimals>}``, best first. The same bigram can appear more than once
        when it ranks highly in several tweets. An empty list is returned when
        there are no tweets or ``topN`` is not positive.
    """
    # Nothing to rank; also avoids the vectorizer raising on an empty corpus.
    if len(tweets) == 0 or topN <= 0:
        return []

    vectorizer = TfidfVectorizer(use_idf=True, min_df=1, lowercase=True,
                                 stop_words='english', ngram_range=(2, 2))
    tfidf = vectorizer.fit_transform(tweets.body)  # sparse: one row per tweet

    # Fetch the vocabulary once instead of once per token. The old accessor
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that do not have get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    top_tokens = []
    for row_index in range(tfidf.shape[0]):
        # Densify one row at a time rather than the whole matrix.
        row_scores = tfidf[row_index].toarray().ravel()
        for feature_index in row_scores.argsort()[-topN:]:
            score = float(row_scores[feature_index])
            # A score of 0 means the bigram does not occur in this tweet; skip it
            # so short tweets do not contribute bogus zero-score "top" tokens.
            if score > 0:
                top_tokens.append({"token": str(feature_names[feature_index]),
                                   "score": round(score, 5)})

    top_tokens.sort(key=lambda entry: entry['score'], reverse=True)
    return top_tokens[:topN]

In [12]:
# Top-scoring TF-IDF bigrams for the tweets in the selected window (default topN=10).
get_tweets_top_tokens(tweets)

[{'score': 0.427, 'token': 'javascript womenwhocode'},
 {'score': 0.427, 'token': 'womenwhocode vuejs'},
 {'score': 0.427, 'token': 'reactjs angularjs'},
 {'score': 0.427, 'token': '100daysofcode javascript'},
 {'score': 0.427, 'token': 'grow 100daysofcode'},
 {'score': 0.35796, 'token': 'introduction laravel'},
 {'score': 0.33564, 'token': 'best coursera'},
 {'score': 0.33564, 'token': 'certifications specializations'},
 {'score': 0.32586, 'token': 'css text'},
 {'score': 0.32586, 'token': 'text shaking'}]