In [2]:
import numpy as np
import pandas as pd

In [3]:
#text cleaning code
import string
import re
 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
from nltk.tokenize import TweetTokenizer
 
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])
 
# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)
#word U.S. and can't are getting in bad form after cleanup
def clean_tweets(tweet,remove_stopword,remove_punctuation,remove_emoticons):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
 
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
 
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if ((word not in stopwords_english or (not remove_stopword)) and # remove stopwords
              (word not in emoticons or (not remove_emoticons)) and # remove emoticons
                word not in string.punctuation or (not remove_punctuation)): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
 
    return tweets_clean
    

In [4]:
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)

def no_characters(words):
  chars=0
  for word in words:
    chars=chars+len(word)
  return chars

def cosine(tweet,query):
    vec1 = text_to_vector(tweet.lower())
    vec2 = text_to_vector(query.lower())
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def jaccard(tweet,query):
    vec1 = text_to_vector(tweet.lower())
    vec2 = text_to_vector(query.lower())
    intersection = set(vec1.keys()) & set(vec2.keys())
    union =set(vec1.keys()) | set(vec2.keys())
    return len(intersection)/len(union)

def dice(tweet,query):
  vec1 = text_to_vector(tweet.lower())
  vec2 = text_to_vector(query.lower())
  intersection = set(vec1.keys()) & set(vec2.keys())
  return 2*len(intersection)/(len(vec1)+len(vec2))

In [5]:
# https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math

def count_words(tweet):
  x=word_tokenize(tweet)
  print(x)
  return len(x)

def get_doc(docs):
  doc_info=[]
  i=0
  for doc,query in docs[['cleaned_tweet','cleaned_query']].itertuples(index=False):
    i=i+1
    count=len(clean_tweets(doc,False,False,True))
    doc_info.append({'doc_id':i,'query':query,'doc_length':count})
  return doc_info

def create_freq_dict(docs):
  freqDist_list=[]
  i=0
  for doc,query in docs[['cleaned_tweet','cleaned_query']].itertuples(index=False):
    i=i+1
    freq_dict={}
    words=clean_tweets(doc,False,False,True)
    for word in words:
      if word in freq_dict:
        freq_dict[word]+=1
      else:
        freq_dict[word]=1
    freqDist_list.append({'doc_id':i,'query':query,'freq_dict':freq_dict})
  return freqDist_list
def computeTF(doc_info, freqDict_list):
  TF_scores=[]
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    tf_score={}
    for key in tempDict['freq_dict']:
      tf_score[key]=tempDict['freq_dict'][key]/doc_info[id-1]['doc_length']
    temp={'doc_id':id,'query':query,'TF_score':tf_score}
    TF_scores.append(temp)
  return TF_scores

def computeIDF(doc_info, freqDict_list):
  IDF_scores=[]
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    idf_score={}
    for key in tempDict['freq_dict'].keys():
      count=sum([key in tempDict['freq_dict'] for tempDict in freqDict_list])
      idf_score[key]=math.log(len(doc_info)/count)
    IDF_scores.append({'doc_id':id,'query':query,'IDF_score':idf_score})
  return IDF_scores

def TFIDF_similarity(TF_scores, IDF_scores, doc_info, freqDict_list, cosine_similarity, clean_tweets,tweets):
  TFIDF_scores=[]
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    score=0
    sumOfSquareWeights=0
    for key in tempDict['freq_dict']:
      if key in query.split(" "):
        tf_t_d=TF_scores[id-1]['TF_score'][key]
        idf_t=IDF_scores[id-1]['IDF_score'][key]
        sumOfSquareWeights=sumOfSquareWeights+idf_t**2
        score=score+(math.sqrt(tf_t_d))*(idf_t**2)*doc_info[id-1]['doc_length']
    coord_factor_q_d=cosine_similarity[id-1]
    # queryNorm=1/math.sqrt(sumOfSquareWeights)#sometimes cause float division error
    score=score*coord_factor_q_d#*queryNorm
    TFIDF_scores.append(score)
  return TFIDF_scores

def Okapi_BM25(k,b,TF_scores, IDF_scores, doc_info, freqDict_list, cosine_similarity, tweet_lengths,clean_tweets,tweets):
  OKAPI_scores=[]
  average_doc_length=tweet_lengths.sum()/len(tweet_lengths)
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    score=0
    for key in tempDict['freq_dict']:
      if key in query.split(" "):
        tf_t_d=TF_scores[id-1]['TF_score'][key]
        idf_t=IDF_scores[id-1]['IDF_score'][key]
        numerator=(k+1)*tf_t_d
        denominator=k*(1-b+b*(tweet_lengths[id-1]/average_doc_length))+tf_t_d
        score=score+idf_t*(numerator/denominator)
    OKAPI_scores.append(score)
  return OKAPI_scores

In [50]:
# encoding: utf-8 
import csv
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
queries=['narendra modi','artificial intelligence','coronavirus','pakistan','INDvAUS']
trainsets=[]
for query in queries:
    d=pd.read_csv('data/'+query+'.csv')
    
    d['cleaned_tweet']=[" ".join(clean_tweets(tweet,True,True,True)) for tweet in d['tweet']]
    d['cleaned_query']=[" ".join(clean_tweets(query,True,True,True)) for query in d['Query']]                             
    d['rank']=[d.shape[0]-i for i in range(0,d.shape[0])]
    d['how_old']=[(datetime.now()-datetime.strptime(date, '%I:%M %p · %d %b %Y')).total_seconds() for date in d['date']]
    d['cosine']=[cosine(tweet,query)*100 for tweet,query in d[['cleaned_tweet','cleaned_query']].itertuples(index=False)]
    d['jaccard']=[jaccard(tweet,query)*100 for tweet,query in d[['cleaned_tweet','cleaned_query']].itertuples(index=False)]
    d['url_bool']=[(0 if i==0 else 1) for i in d['url_count']]
    d['hashtag_count']=[(0 if i is np.nan else len(i.split(','))) for i in d['tags']]
    d['hashtag_bool']=[(0 if i==0 else 1) for i in d['hashtag_count']]
    d['dice']=[dice(tweet,query)*100 for tweet,query in d[['cleaned_tweet','cleaned_query']].itertuples(index=False)]
    d['word_count']=[len(tweet.split(' ')) for tweet in d['tweet']]
    d['char_count']=[no_characters(tweet.split(' ')) for tweet in d['tweet']]
    d['follower_friend_relation']=[0 if created_on is np.nan else 100*max(1,followers-friends)/(datetime.now()-datetime.strptime(created_on, '%Y-%m-%d %H:%M:%S')).total_seconds() for followers,friends,created_on in d[['followers_count','friends_count','created_at']].itertuples(index=False)]
    
    doc_info=get_doc(d[['cleaned_tweet','cleaned_query']])
    freqDict_list=create_freq_dict(d[['cleaned_tweet','cleaned_query']])
    TF_scores = computeTF(doc_info,freqDict_list)
    IDF_scores=computeIDF(doc_info,freqDict_list)
    TFIDF_scores=TFIDF_similarity(TF_scores,IDF_scores,doc_info,freqDict_list,d['cosine'],d['cleaned_tweet'],d['tweet'])
    d['tfidf_similarity']=TFIDF_scores
    OKAPI_scores=Okapi_BM25(0.75,0.5,TF_scores,IDF_scores,doc_info,freqDict_list,d['cosine'],d['word_count'],d['cleaned_tweet'],d['tweet'])
    d['okapi']=OKAPI_scores
    numerical_features=d[['followers_count','friends_count','listed_count','likes','comments','retweets','sum_followers_mention','url_count','how_old','cosine','jaccard','hashtag_count','dice','word_count','char_count','follower_friend_relation','tfidf_similarity','okapi']]

    #to  scale features
#     numerical_features=pd.DataFrame(scaler.fit_transform(numerical_features), columns=numerical_features.columns)
    
    data=pd.concat([d[['Query','rank','verified','Img_present','url_bool','hashtag_bool']],numerical_features],axis=1)
    trainsets.append(data)
    print(data.columns) 


Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_count', 'how_old', 'cosine',
       'jaccard', 'hashtag_count', 'dice', 'word_count', 'char_count',
       'follower_friend_relation', 'tfidf_similarity', 'okapi'],
      dtype='object')
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_count', 'how_old', 'cosine',
       'jaccard', 'hashtag_count', 'dice', 'word_count', 'char_count',
       'follower_friend_relation', 'tfidf_similarity', 'okapi'],
      dtype='object')
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_co

In [51]:
file1 = open("MyFile.txt","w")
label_data = {}
# for query in queries:
#     label_data[query] = {}
for index,trainset in enumerate(trainsets):
    divider=math.ceil(len(trainset)/10)
    for index2,row in enumerate(trainset.values):
        s=""
        s=s+str(min(9,int(row[1]//divider)))
        s=s+" "+"qid:"+str(index+1)
        for j,data in enumerate(row[2:]):
            s=s+" "+str(j+1)+":"+str(data)
        s=s+" "+"#docid = "+str(index2+1)+"_of_"+row[0]
        s=s+"\n"
        if index+1 not in label_data:
            label_data[index+1] = {}
        label_data[index+1][index2+1] = min(9,int(row[1]//divider))
#         print(s)
        file1.write(s)
#         if index2>5:
#             break
file1.close()


In [64]:
f = open("RankNetRankedLists.txt","r")
f1 = f.readlines()
ans = {}
for x in f1:
    y = x.split()
    value = y[6]
    q_id = y[0]
    z = x.split('_')
    i = z[0]
    i = i.split()
    i = i[4]
    if q_id not in ans:
        ans[q_id] = []
    ans[q_id].append([y[6],i])



In [65]:
for key in ans.keys():
#     print(key)
#     print(ans[key])
    ans[key].sort(reverse = True)
    #print(ans[key])
    li = []
    for i in ans[key]:
#         print(label_data[int(key)][int(i[1])])
        if int(i[1]) in label_data[int(key)]:
            label = label_data[int(key)][int(i[1])]
            li.append([i[1],label_data[int(key)][int(i[1])]])
        else :
            print(key + " " + i[1])

    print(li) 
    print("change/n")

[['1051', 1], ['1014', 1], ['1042', 1], ['889', 2], ['1054', 1], ['1053', 1], ['823', 3], ['1114', 1], ['1008', 1], ['1039', 1], ['99', 9], ['1040', 1], ['1007', 1], ['1041', 1], ['1057', 1], ['1186', 0], ['1037', 1], ['1232', 0], ['1000', 1], ['700', 4], ['1239', 0], ['98', 9], ['1032', 1], ['991', 2], ['1055', 1], ['938', 2], ['985', 2], ['1036', 1], ['987', 2], ['828', 3], ['1030', 1], ['1028', 1], ['97', 9], ['1011', 1], ['1003', 1], ['1035', 1], ['977', 2], ['1237', 0], ['1172', 0], ['1031', 1], ['967', 2], ['989', 2], ['986', 2], ['96', 9], ['961', 2], ['761', 3], ['993', 2], ['962', 2], ['569', 5], ['992', 2], ['983', 2], ['951', 2], ['373', 6], ['649', 4], ['95', 9], ['964', 2], ['963', 2], ['730', 4], ['1018', 1], ['1017', 1], ['1016', 1], ['995', 2], ['1006', 1], ['1021', 1], ['958', 2], ['94', 9], ['1004', 1], ['1020', 1], ['996', 2], ['1015', 1], ['994', 2], ['576', 5], ['947', 2], ['521', 5], ['568', 5], ['984', 2], ['93', 9], ['919', 2], ['701', 4], ['998', 1], ['999', 1]