In [12]:
################################FUNCTION FOR CLEANING TEXT
#text cleaning code
import string
import re
 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
from nltk.tokenize import TweetTokenizer
 
# Emoticons treated as "happy"
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons treated as "sad"
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon (happy and sad combined)
emoticons = emoticons_happy | emoticons_sad
# Known issue: tokens such as "U.S." and "can't" come out malformed after cleanup
def clean_tweets(tweet,remove_stopword,remove_punctuation,remove_emoticons):
    """Normalise a tweet and return its stemmed tokens.

    The text is stripped of stock tickers (``$GE``), a leading old-style
    ``RT`` marker, hyperlinks and ``#`` signs (the hashtag word itself is
    kept).  It is then tokenised with ``TweetTokenizer(preserve_case=False,
    strip_handles=True, reduce_len=True)`` and every surviving token is
    stemmed with the module-level Porter stemmer.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    remove_stopword : bool
        Drop tokens found in ``stopwords_english``.
    remove_punctuation : bool
        Drop tokens that occur in ``string.punctuation``.
    remove_emoticons : bool
        Drop tokens found in ``emoticons``.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks: only the URL itself (up to the next whitespace), so
    # any text that follows a link on the same line is preserved
    tweet = re.sub(r'https?://\S+', '', tweet)

    # only remove the hash sign, keep the hashtag word
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    tweets_clean = []
    for word in tokenizer.tokenize(tweet):
        # Each filter is applied independently of the others.  (Previously the
        # three tests were chained as `a and b and c or d`, so disabling
        # punctuation removal silently disabled the other two filters too.)
        if remove_stopword and word in stopwords_english:
            continue
        if remove_emoticons and word in emoticons:
            continue
        if remove_punctuation and word in string.punctuation:
            continue
        tweets_clean.append(stemmer.stem(word))

    return tweets_clean
    

In [13]:
##########################UTILITY FUNCTIONS
# https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math

import re, math
from collections import Counter

# Matches one maximal run of word characters
WORD = re.compile(r'\w+')

def text_to_vector(text):
    """Return a Counter mapping each ``\\w+`` token of ``text`` to its number of occurrences."""
    return Counter(match.group() for match in WORD.finditer(text))

def no_characters(words):
  """Return the combined length of all items in ``words``."""
  return sum(len(word) for word in words)

def cosine(tweet,query):
    """Cosine similarity between the lower-cased term-count vectors of two texts.

    Returns 0.0 when either text has no tokens (zero-length vector).
    """
    tweet_counts = text_to_vector(tweet.lower())
    query_counts = text_to_vector(query.lower())

    shared_terms = tweet_counts.keys() & query_counts.keys()
    dot_product = sum(tweet_counts[term] * query_counts[term] for term in shared_terms)

    tweet_sq = sum(count**2 for count in tweet_counts.values())
    query_sq = sum(count**2 for count in query_counts.values())
    norm_product = math.sqrt(tweet_sq) * math.sqrt(query_sq)

    if norm_product:
        return float(dot_product) / norm_product
    return 0.0

def jaccard(tweet,query):
    """Jaccard similarity between the sets of distinct lower-cased tokens of two texts.

    Returns ``|A & B| / |A | B|``, or 0.0 when neither text contains a token
    (the ratio is undefined there and used to raise ZeroDivisionError).
    """
    vec1 = text_to_vector(tweet.lower())
    vec2 = text_to_vector(query.lower())
    tweet_terms = set(vec1.keys())
    query_terms = set(vec2.keys())
    union = tweet_terms | query_terms
    if not union:
        return 0.0
    return len(tweet_terms & query_terms)/len(union)

def dice(tweet,query):
  """Sorensen-Dice coefficient between the distinct lower-cased tokens of two texts.

  Returns ``2*|A & B| / (|A| + |B|)``, or 0.0 when neither text contains a
  token (the ratio is undefined there and used to raise ZeroDivisionError).
  """
  vec1 = text_to_vector(tweet.lower())
  vec2 = text_to_vector(query.lower())
  total_terms = len(vec1)+len(vec2)
  if total_terms == 0:
    return 0.0
  intersection = set(vec1.keys()) & set(vec2.keys())
  return 2*len(intersection)/total_terms

def count_words(tweet):
  """Tokenise ``tweet`` with ``word_tokenize``, print the tokens and return how many there are."""
  tokens=word_tokenize(tweet)
  print(tokens)
  token_count=len(tokens)
  return token_count

def get_doc(docs):
  """Build one length record per row of ``docs``.

  ``docs`` must expose the columns 'cleaned_tweet' and 'cleaned_query'.
  Each record is ``{'doc_id', 'query', 'doc_length'}`` where ``doc_id`` is the
  1-based row position and ``doc_length`` is the number of tokens returned by
  ``clean_tweets(doc, False, False, True)``.
  """
  rows=docs[['cleaned_tweet','cleaned_query']].itertuples(index=False)
  return [
    {'doc_id':doc_id,'query':query,'doc_length':len(clean_tweets(doc,False,False,True))}
    for doc_id,(doc,query) in enumerate(rows,start=1)
  ]

def create_freq_dict(docs):
  """Build one term-frequency record per row of ``docs``.

  ``docs`` must expose the columns 'cleaned_tweet' and 'cleaned_query'.
  Each record is ``{'doc_id', 'query', 'freq_dict'}`` where ``doc_id`` is the
  1-based row position and ``freq_dict`` maps every token returned by
  ``clean_tweets(doc, False, False, True)`` to its count in that document.
  """
  freqDist_list=[]
  rows=docs[['cleaned_tweet','cleaned_query']].itertuples(index=False)
  for doc_id,(doc,query) in enumerate(rows,start=1):
    term_counts={}
    for term in clean_tweets(doc,False,False,True):
      term_counts[term]=term_counts.get(term,0)+1
    freqDist_list.append({'doc_id':doc_id,'query':query,'freq_dict':term_counts})
  return freqDist_list
def computeTF(doc_info, freqDict_list):
  """Compute per-document term frequencies.

  For every record of ``freqDict_list`` each raw term count is divided by the
  'doc_length' of the matching ``doc_info`` entry (looked up at position
  ``doc_id - 1``).  Returns a list of ``{'doc_id', 'query', 'TF_score'}``
  records in the same order as ``freqDict_list``.
  """
  TF_scores=[]
  for entry in freqDict_list:
    doc_id=entry['doc_id']
    # doc_info is only consulted for documents that actually contain terms
    term_tf={
      term:count/doc_info[doc_id-1]['doc_length']
      for term,count in entry['freq_dict'].items()
    }
    TF_scores.append({'doc_id':doc_id,'query':entry['query'],'TF_score':term_tf})
  return TF_scores

def computeIDF(doc_info, freqDict_list):
  """Compute inverse document frequencies for the terms of every document.

  The IDF of a term is ``log(len(doc_info) / df)`` where ``df`` is the number
  of records in ``freqDict_list`` whose 'freq_dict' contains the term.
  Returns a list of ``{'doc_id', 'query', 'IDF_score'}`` records in the same
  order as ``freqDict_list``; each 'IDF_score' only holds the terms of that
  document.
  """
  # Count document frequencies in a single pass instead of rescanning the
  # whole corpus for every term of every document.
  doc_freq={}
  for entry in freqDict_list:
    for term in entry['freq_dict']:
      doc_freq[term]=doc_freq.get(term,0)+1

  n_docs=len(doc_info)
  IDF_scores=[]
  for entry in freqDict_list:
    idf_score={term:math.log(n_docs/doc_freq[term]) for term in entry['freq_dict']}
    IDF_scores.append({'doc_id':entry['doc_id'],'query':entry['query'],'IDF_score':idf_score})
  return IDF_scores

def TFIDF_similarity(TF_scores, IDF_scores, doc_info, freqDict_list, cosine_similarity, clean_tweets,tweets):
  """Score every document against its query with a TF-IDF weighted sum.

  For each document term that also appears in the space-separated query, the
  contribution ``sqrt(tf) * idf**2 * doc_length`` is accumulated; the total is
  then multiplied by ``cosine_similarity[doc_id - 1]``.  No query
  normalisation is applied.  ``clean_tweets`` and ``tweets`` are accepted but
  not used.

  Returns a list with one score per record of ``freqDict_list``.
  """
  TFIDF_scores=[]
  for entry in freqDict_list:
    position=entry['doc_id']-1
    query=entry['query']
    score=0
    for term in entry['freq_dict']:
      if term not in query.split(" "):
        continue
      tf_t_d=TF_scores[position]['TF_score'][term]
      idf_t=IDF_scores[position]['IDF_score'][term]
      score=score+(math.sqrt(tf_t_d))*(idf_t**2)*doc_info[position]['doc_length']
    # the cosine similarity acts as the coordination factor between query and document
    TFIDF_scores.append(score*cosine_similarity[position])
  return TFIDF_scores

def Okapi_BM25(k,b,TF_scores, IDF_scores, doc_info, freqDict_list, cosine_similarity, tweet_lengths,clean_tweets,tweets):
  """Score every document against its query with an Okapi BM25 style formula.

  For each document term that also appears in the space-separated query the
  contribution is ``idf * ((k+1)*tf) / (k*(1 - b + b*len/avg_len) + tf)``,
  where ``len`` is ``tweet_lengths[doc_id - 1]`` and ``avg_len`` is the mean
  of ``tweet_lengths`` (which must provide ``.sum()`` and ``len()``).
  ``doc_info``, ``cosine_similarity``, ``clean_tweets`` and ``tweets`` are
  accepted but not used.

  Returns a list with one score per record of ``freqDict_list``.
  """
  average_doc_length=tweet_lengths.sum()/len(tweet_lengths)
  OKAPI_scores=[]
  for entry in freqDict_list:
    position=entry['doc_id']-1
    query=entry['query']
    score=0
    for term in entry['freq_dict']:
      if term not in query.split(" "):
        continue
      tf_t_d=TF_scores[position]['TF_score'][term]
      idf_t=IDF_scores[position]['IDF_score'][term]
      # document-length normalisation term of BM25
      length_norm=1-b+b*(tweet_lengths[position]/average_doc_length)
      score=score+idf_t*(((k+1)*tf_t_d)/(k*length_norm+tf_t_d))
    OKAPI_scores.append(score)
  return OKAPI_scores

In [17]:
#############CODE TO READ RAW DATA FROM FILE AND CONVERT INTO PANDAS DATAFRAME WITH COLUMNS AS FEATURES
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
queries=['4 Years of FAN','coronavirus','economical crisis','INDvAUS','kohli','MayThe4thBeWithYou','narendra modi','netflix','once upon a time','Pandemic','Silver Lake']
# queries=['artificial intelligence','womens day','pulitzer','T20 India']
trainsets=[]
# When True only the first 150 rows of every file are used and the
# 'how_old' feature is neither computed nor exported.
top_n=True
for query in queries:
    d=pd.read_csv('data/'+query+'.csv')
    if top_n:
        # copy so the column assignments below write to an independent frame
        # rather than to a slice of the frame returned by read_csv
        d=d[:150].copy()
    print(query)
    # one reference timestamp per file, so every row is aged against the same instant
    now=datetime.now()
    d['cleaned_tweet']=[" ".join(clean_tweets(tweet,True,True,True)) for tweet in d['tweet']]
    d['cleaned_query']=[" ".join(clean_tweets(raw_query,True,True,True)) for raw_query in d['Query']]
    # rows are assumed to arrive best-first: the first row gets the highest rank value
    d['rank']=[d.shape[0]-i for i in range(0,d.shape[0])]
    if not top_n:
        # The date column comes in one of two layouts; if the first layout does
        # not parse, the whole column is re-parsed with the second one.
        try:
            d['how_old']=[(now-datetime.strptime(date, '%I:%M %p · %d %b %Y')).total_seconds() for date in d['date']]
        except ValueError:
            d['how_old']=[(now-datetime.strptime(date, '%I:%M %p · %b %d, %Y')).total_seconds() for date in d['date']]
    # (cleaned tweet, cleaned query) pairs shared by the three text-similarity features
    text_pairs=list(d[['cleaned_tweet','cleaned_query']].itertuples(index=False))
    d['cosine']=[cosine(tweet_text,query_text)*100 for tweet_text,query_text in text_pairs]
    d['jaccard']=[jaccard(tweet_text,query_text)*100 for tweet_text,query_text in text_pairs]
    d['url_bool']=[(0 if i==0 else 1) for i in d['url_count']]
    # pd.isna recognises any missing value, not only the np.nan singleton object
    d['hashtag_count']=[(0 if pd.isna(i) else len(i.split(','))) for i in d['tags']]
    d['hashtag_bool']=[(0 if i==0 else 1) for i in d['hashtag_count']]
    d['dice']=[dice(tweet_text,query_text)*100 for tweet_text,query_text in text_pairs]
    d['word_count']=[len(tweet.split(' ')) for tweet in d['tweet']]
    d['char_count']=[no_characters(tweet.split(' ')) for tweet in d['tweet']]
    d['follower_friend_relation']=[0 if pd.isna(created_on) else 100*max(1,followers-friends)/(now-datetime.strptime(created_on, '%Y-%m-%d %H:%M:%S')).total_seconds() for followers,friends,created_on in d[['followers_count','friends_count','created_at']].itertuples(index=False)]
    # NOTE(review): the next line overwrites the feature computed just above
    # with a copy of 'word_count'. It is kept as-is because removing it would
    # change the exported feature values — confirm whether it is intentional.
    d['follower_friend_relation']=d['word_count']
    doc_info=get_doc(d[['cleaned_tweet','cleaned_query']])
    freqDict_list=create_freq_dict(d[['cleaned_tweet','cleaned_query']])
    TF_scores = computeTF(doc_info,freqDict_list)
    IDF_scores=computeIDF(doc_info,freqDict_list)
    TFIDF_scores=TFIDF_similarity(TF_scores,IDF_scores,doc_info,freqDict_list,d['cosine'],d['cleaned_tweet'],d['tweet'])
    d['tfidf_similarity']=TFIDF_scores
    OKAPI_scores=Okapi_BM25(0.75,0.5,TF_scores,IDF_scores,doc_info,freqDict_list,d['cosine'],d['word_count'],d['cleaned_tweet'],d['tweet'])
    d['okapi']=OKAPI_scores
    # 'how_old' sits between 'url_count' and 'cosine' when it is available
    feature_columns=['followers_count','friends_count','listed_count','likes','comments','retweets','sum_followers_mention','url_count']
    if not top_n:
        feature_columns.append('how_old')
    feature_columns+=['cosine','jaccard','hashtag_count','dice','word_count','char_count','follower_friend_relation','tfidf_similarity','okapi']
    numerical_features=d[feature_columns]

#     to  scale features
#     numerical_features=pd.DataFrame(scaler.fit_transform(numerical_features), columns=numerical_features.columns)
    data=pd.concat([d[['Query','rank','verified','Img_present','url_bool','hashtag_bool']],numerical_features],axis=1)
    trainsets.append(data)
    print(data.columns)


4 Years of FAN
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_count', 'cosine', 'jaccard',
       'hashtag_count', 'dice', 'word_count', 'char_count',
       'follower_friend_relation', 'tfidf_similarity', 'okapi'],
      dtype='object')
coronavirus
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_count', 'cosine', 'jaccard',
       'hashtag_count', 'dice', 'word_count', 'char_count',
       'follower_friend_relation', 'tfidf_similarity', 'okapi'],
      dtype='object')
economical crisis
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_foll

In [18]:
#########CODE TO WRITE THE INFO FROM DATAFRAME INTO A FILE IN FORMAT SAME AS LETOR DATASET
# label_data[qid][doc_id] -> relevance label (0..9)
# score_data[qid]         -> list of [score, tie_break_value, doc_id]; score starts at 0
label_data = {}
score_data = {}
# The with-block guarantees the file is flushed and closed even if a row fails.
with open("MyFile150.txt","w") as file1:
    for index,trainset in enumerate(trainsets):
        qid=index+1
        # bucket width that maps 'rank' onto ten relevance grades
        divider=math.ceil(len(trainset)/10)
        for index2,row in enumerate(trainset.values):
            doc_id=index2+1
            # row[0] is the query text, row[1] the rank, row[2:] the features
            label=min(9,int(row[1]//divider))
            fields=[str(label),"qid:"+str(qid)]
            for j,value in enumerate(row[2:]):
                fields.append(str(j+1)+":"+str(value))
                if j+1 == 13:
                    # Feature 13 is stored as the tie-break value. With the
                    # column order built by the feature cell this is 'how_old'
                    # when top_n is False and 'cosine' when it is True.
                    # NOTE(review): confirm that is intended.
                    date = value
            fields.append("#docid = "+str(doc_id)+"_of_"+"_".join(row[0].split(" ")))
            if qid not in label_data:
                label_data[qid] = {}
                score_data[qid] = []
            label_data[qid][doc_id] = label
            score_data[qid].append([0,date,doc_id])
            file1.write(" ".join(fields)+"\n")


In [1]:
import subprocess
import numpy as np
model = "rankBoost"
cmd=["java","-jar","RankLib-2.13.jar","-rank","MyTest.txt","-load",model+".txt","-norm","zscore","-indri",model+"RankedLists.txt"]
output=[]
# The context manager closes the pipe and waits for RankLib to exit, so the
# ranked-list file is complete before it is opened below.
with subprocess.Popen(cmd, stdout=subprocess.PIPE) as proc:
    for line in proc.stdout:
        print(line)
        # strip only the line terminator (a fixed two-byte slice removed a real
        # character whenever the line ended in "\n" rather than "\r\n")
        output.append(line.rstrip(b"\r\n"))

# label_data[qid][doc_id] -> relevance label
# score_data[qid]         -> list of [score, tie_break_value, doc_id]; score starts at 0
score_data={}
label_data = {}
with open("MyTest.txt","r") as file1:
    for x in file1:
        curr_row=x.split(" ")
        qid=int(curr_row[1].split(":")[1])
        # token 14 is feature 13; token 26 is the "<n>_of_<query>" doc id
        # (fixed positions: the test file must have the matching number of features)
        date=float(curr_row[14].split(":")[1])
        doc_id=int(curr_row[26].split("_")[0])
        if qid not in score_data:
            label_data[qid] = {}
            score_data[qid] = []
        label_data[qid][doc_id] = int(curr_row[0])
        score_data[qid].append([0,date,doc_id])

with open(model+"RankedLists.txt","r") as f:
    f1 = f.readlines()
ans = {}
for x in f1:
    y = x.split()
    value = y[-2]   # model score: second-to-last field of the line
    q_id = y[0]
    # the doc number is the 5th whitespace-separated token before the first "_"
    i = x.split('_')[0].split()[4]
    for entry in score_data[int(q_id)]:
        if entry[2] == int(i):
            entry[0] = float(value)
            break

# order every query's documents by descending score, ties by ascending tie-break value
for ranked_docs in score_data.values():
    ranked_docs.sort(key = lambda sub: (-sub[0], sub[1]))

# NOTE: ndcg() is defined in a later cell of this notebook; that cell has to be
# executed before this one.
tot_ndcg = 0
for qid in score_data:
    y_score = [doc[0] for doc in score_data[qid]]
    y_true = [label_data[qid][doc[2]] for doc in score_data[qid]]
    query_ndcg = ndcg(y_true,y_score)
    print(query_ndcg)
    tot_ndcg+=query_ndcg
tot_ndcg/=len(label_data)
print(tot_ndcg)

b'\n'
b'Discard orig. features\n'
b'Model file:\trankBoost.txt\n'
b'Feature normalization: zscore\n'


FileNotFoundError: [Errno 2] No such file or directory: 'rankBoostRankedLists.txt'

In [None]:
# Copy the scores from the LambdaMART ranked list into score_data
# (entries are [score, tie_break_value, doc_id]).
with open("lambdaMartRankedLists.txt","r") as f:   # closed as soon as the lines are read
    f1 = f.readlines()
ans = {}
for x in f1:
    y = x.split()
    value = y[-2]   # model score: second-to-last field of the line
    q_id = y[0]
    # the doc number is the 5th whitespace-separated token before the first "_"
    i = x.split('_')[0].split()[4]
    for entry in score_data[int(q_id)]:
        if entry[2] == int(i):
            entry[0] = float(value)
            break
#     if q_id not in ans:
#         ans[q_id] = []
#     ans[q_id].append([float(value),int(i)])



In [None]:
# Order every query's documents by descending score, ties by ascending
# tie-break value. Iterating the values avoids rebinding the builtin `id`.
for ranked_docs in score_data.values():
    ranked_docs.sort(key = lambda sub: (-sub[0], sub[1]))

In [None]:
def ndcg(y_true, y_score, k=1500):
    """Normalised discounted cumulative gain at cut-off ``k``.

    Parameters
    ----------
    y_true : sequence of int
        Relevance label of every document.
    y_score : sequence of float
        Predicted score of every document, aligned with ``y_true``.
    k : int, optional
        Only the top ``min(k, len(y_score))`` positions are evaluated.

    Returns
    -------
    float
        DCG of the ranking induced by ``y_score`` divided by the DCG of the
        ideal ranking, using gain ``2**label - 1`` and discount
        ``log2(position + 1)``.  Returns 0.0 when the ideal DCG is zero
        (no documents, or no relevant document within the cut-off) instead of
        dividing by zero.
    """
    cutoff = min(k, len(y_score))

    y_true_sorted = sorted(y_true, reverse=True)
    ideal_dcg = 0
    for i in range(cutoff):
        ideal_dcg += (2 ** y_true_sorted[i] - 1.) / np.log2(i + 2)
    if ideal_dcg == 0:
        return 0.0

    dcg = 0
    argsort_indices = np.argsort(y_score)[::-1]   # document indices, best score first
    for i in range(cutoff):
        dcg += (2 ** y_true[argsort_indices[i]] - 1.) / np.log2(i + 2)
    return dcg / ideal_dcg

In [None]:
# Mean NDCG over all queries; score_data entries are [score, tie_break_value, doc_id].
tot_ndcg = 0
for qid in score_data:
    y_score = [doc[0] for doc in score_data[qid]]
    y_true = [label_data[qid][doc[2]] for doc in score_data[qid]]
    # evaluate once and reuse the value for both the printout and the running total
    query_ndcg = ndcg(y_true,y_score)
    print(query_ndcg)
    tot_ndcg+=query_ndcg
tot_ndcg/=len(label_data)
print(tot_ndcg)

In [None]:
# Collect the LambdaMART ranked list as ans[q_id] -> list of [score, doc_number].
with open("lambdaMartRankedLists.txt","r") as f:   # closed as soon as the lines are read
    f1 = f.readlines()
ans = {}
for x in f1:
    y = x.split()
    value = y[-2]   # model score: second-to-last field of the line
    q_id = y[0]
    # the doc number is the 5th whitespace-separated token before the first "_"
    i = x.split('_')[0].split()[4]
    if q_id not in ans:
        ans[q_id] = []
    ans[q_id].append([float(value),int(i)])



In [None]:
# For every query print its [doc_number, label] pairs in ranked order
# (descending score, ties by ascending doc number).
for key in ans.keys():
    # sorted() returns a new list; its result has to be used for the ordering to take effect
    ranked = sorted(ans[key], key = lambda sub: (-sub[0], sub[1]))
    li = []
    for i in ranked:
        doc_number = int(i[1])
        if doc_number in label_data[int(key)]:
            li.append([i[1],label_data[int(key)][doc_number]])
        else :
            # report documents that have no label; i[1] is an int, so convert before concatenating
            print(key + " " + str(i[1]))

    print(li)
    print("change\n")

In [None]:
# s = 0
# for x in trainsets:
#     s+=len(x)
# print(s)

In [None]:
# import os
# queries=['4 Years of FAN','MayThe4thBeWithYou','once upon a time','Pandemic','Silver Lake']#,'narendra modi','artificial intelligence','coronavirus','pakistan']
# for query in queries:
#     d=pd.read_csv('BTP/data/sample_'+query+'.csv')
#     d=d.drop(['Unnamed: 0'], axis = 1) 
#     os.remove('BTP/data/sample_'+query+'.csv')
#     d.to_csv('BTP/data/sample_'+query+'.csv', index=False)