In [3]:
import numpy as np
import pandas as pd

In [4]:
#text cleaning code
import string
import re
 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
from nltk.tokenize import TweetTokenizer
 
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])
 
# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)
#word U.S. and can't are getting in bad form after cleanup
def clean_tweets(tweet,remove_stopword,remove_punctuation,remove_emoticons):
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
 
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
 
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if ((word not in stopwords_english or (not remove_stopword)) and # remove stopwords
              (word not in emoticons or (not remove_emoticons)) and # remove emoticons
                word not in string.punctuation or (not remove_punctuation)): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
 
    return tweets_clean
    

In [5]:
import re, math
from collections import Counter

WORD = re.compile(r'\w+')

def text_to_vector(text):
     words = WORD.findall(text)
     return Counter(words)

def no_characters(words):
  chars=0
  for word in words:
    chars=chars+len(word)
  return chars

def cosine(tweet,query):
    vec1 = text_to_vector(tweet.lower())
    vec2 = text_to_vector(query.lower())
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def jaccard(tweet,query):
    vec1 = text_to_vector(tweet.lower())
    vec2 = text_to_vector(query.lower())
    intersection = set(vec1.keys()) & set(vec2.keys())
    union =set(vec1.keys()) | set(vec2.keys())
    return len(intersection)/len(union)

def dice(tweet,query):
  vec1 = text_to_vector(tweet.lower())
  vec2 = text_to_vector(query.lower())
  intersection = set(vec1.keys()) & set(vec2.keys())
  return 2*len(intersection)/(len(vec1)+len(vec2))

In [6]:
# https://towardsdatascience.com/tfidf-for-piece-of-text-in-python-43feccaa74f8
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import math

def count_words(tweet):
  x=word_tokenize(tweet)
  print(x)
  return len(x)

def get_doc(docs):
  doc_info=[]
  i=0
  for doc,query in docs[['cleaned_tweet','cleaned_query']].itertuples(index=False):
    i=i+1
    count=len(clean_tweets(doc,False,False,True))
    doc_info.append({'doc_id':i,'query':query,'doc_length':count})
  return doc_info

def create_freq_dict(docs):
  freqDist_list=[]
  i=0
  for doc,query in docs[['cleaned_tweet','cleaned_query']].itertuples(index=False):
    i=i+1
    freq_dict={}
    words=clean_tweets(doc,False,False,True)
    for word in words:
      if word in freq_dict:
        freq_dict[word]+=1
      else:
        freq_dict[word]=1
    freqDist_list.append({'doc_id':i,'query':query,'freq_dict':freq_dict})
  return freqDist_list
def computeTF(doc_info, freqDict_list):
  TF_scores=[]
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    tf_score={}
    for key in tempDict['freq_dict']:
      tf_score[key]=tempDict['freq_dict'][key]/doc_info[id-1]['doc_length']
    temp={'doc_id':id,'query':query,'TF_score':tf_score}
    TF_scores.append(temp)
  return TF_scores

def computeIDF(doc_info, freqDict_list):
  IDF_scores=[]
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    idf_score={}
    for key in tempDict['freq_dict'].keys():
      count=sum([key in tempDict['freq_dict'] for tempDict in freqDict_list])
      idf_score[key]=math.log(len(doc_info)/count)
    IDF_scores.append({'doc_id':id,'query':query,'IDF_score':idf_score})
  return IDF_scores

def TFIDF_similarity(TF_scores, IDF_scores, doc_info, freqDict_list, cosine_similarity, clean_tweets,tweets):
  TFIDF_scores=[]
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    score=0
    sumOfSquareWeights=0
    for key in tempDict['freq_dict']:
      if key in query.split(" "):
        tf_t_d=TF_scores[id-1]['TF_score'][key]
        idf_t=IDF_scores[id-1]['IDF_score'][key]
        sumOfSquareWeights=sumOfSquareWeights+idf_t**2
        score=score+(math.sqrt(tf_t_d))*(idf_t**2)*doc_info[id-1]['doc_length']
    coord_factor_q_d=cosine_similarity[id-1]
    # queryNorm=1/math.sqrt(sumOfSquareWeights)#sometimes cause float division error
    score=score*coord_factor_q_d#*queryNorm
    TFIDF_scores.append(score)
  return TFIDF_scores

def Okapi_BM25(k,b,TF_scores, IDF_scores, doc_info, freqDict_list, cosine_similarity, tweet_lengths,clean_tweets,tweets):
  OKAPI_scores=[]
  average_doc_length=tweet_lengths.sum()/len(tweet_lengths)
  for tempDict in freqDict_list:
    id=tempDict['doc_id']
    query=tempDict['query']
    score=0
    for key in tempDict['freq_dict']:
      if key in query.split(" "):
        tf_t_d=TF_scores[id-1]['TF_score'][key]
        idf_t=IDF_scores[id-1]['IDF_score'][key]
        numerator=(k+1)*tf_t_d
        denominator=k*(1-b+b*(tweet_lengths[id-1]/average_doc_length))+tf_t_d
        score=score+idf_t*(numerator/denominator)
    OKAPI_scores.append(score)
  return OKAPI_scores

In [7]:
# encoding: utf-8 
import csv
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
queries=['narendra modi','artificial intelligence','coronavirus','pakistan','INDvAUS']
trainsets=[]
for query in queries:
    d=pd.read_csv('data/'+query+'.csv')
    
    d['cleaned_tweet']=[" ".join(clean_tweets(tweet,True,True,True)) for tweet in d['tweet']]
    d['cleaned_query']=[" ".join(clean_tweets(query,True,True,True)) for query in d['Query']]                             
    d['rank']=[d.shape[0]-i for i in range(0,d.shape[0])]
    d['how_old']=[(datetime.now()-datetime.strptime(date, '%I:%M %p · %d %b %Y')).total_seconds() for date in d['date']]
    d['cosine']=[cosine(tweet,query)*100 for tweet,query in d[['cleaned_tweet','cleaned_query']].itertuples(index=False)]
    d['jaccard']=[jaccard(tweet,query)*100 for tweet,query in d[['cleaned_tweet','cleaned_query']].itertuples(index=False)]
    d['url_bool']=[(0 if i==0 else 1) for i in d['url_count']]
    d['hashtag_count']=[(0 if i is np.nan else len(i.split(','))) for i in d['tags']]
    d['hashtag_bool']=[(0 if i==0 else 1) for i in d['hashtag_count']]
    d['dice']=[dice(tweet,query)*100 for tweet,query in d[['cleaned_tweet','cleaned_query']].itertuples(index=False)]
    d['word_count']=[len(tweet.split(' ')) for tweet in d['tweet']]
    d['char_count']=[no_characters(tweet.split(' ')) for tweet in d['tweet']]
    d['follower_friend_relation']=[0 if created_on is np.nan else 100*max(1,followers-friends)/(datetime.now()-datetime.strptime(created_on, '%Y-%m-%d %H:%M:%S')).total_seconds() for followers,friends,created_on in d[['followers_count','friends_count','created_at']].itertuples(index=False)]
    
    doc_info=get_doc(d[['cleaned_tweet','cleaned_query']])
    freqDict_list=create_freq_dict(d[['cleaned_tweet','cleaned_query']])
    TF_scores = computeTF(doc_info,freqDict_list)
    IDF_scores=computeIDF(doc_info,freqDict_list)
    TFIDF_scores=TFIDF_similarity(TF_scores,IDF_scores,doc_info,freqDict_list,d['cosine'],d['cleaned_tweet'],d['tweet'])
    d['tfidf_similarity']=TFIDF_scores
    OKAPI_scores=Okapi_BM25(0.75,0.5,TF_scores,IDF_scores,doc_info,freqDict_list,d['cosine'],d['word_count'],d['cleaned_tweet'],d['tweet'])
    d['okapi']=OKAPI_scores
    numerical_features=d[['followers_count','friends_count','listed_count','likes','comments','retweets','sum_followers_mention','url_count','how_old','cosine','jaccard','hashtag_count','dice','word_count','char_count','follower_friend_relation','tfidf_similarity','okapi']]

    #to  scale features
#     numerical_features=pd.DataFrame(scaler.fit_transform(numerical_features), columns=numerical_features.columns)
    
    data=pd.concat([d[['Query','rank','verified','Img_present','url_bool','hashtag_bool']],numerical_features],axis=1)
    trainsets.append(data)
    print(data.columns) 


Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_count', 'how_old', 'cosine',
       'jaccard', 'hashtag_count', 'dice', 'word_count', 'char_count',
       'follower_friend_relation', 'tfidf_similarity', 'okapi'],
      dtype='object')
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_count', 'how_old', 'cosine',
       'jaccard', 'hashtag_count', 'dice', 'word_count', 'char_count',
       'follower_friend_relation', 'tfidf_similarity', 'okapi'],
      dtype='object')
Index(['Query', 'rank', 'verified', 'Img_present', 'url_bool', 'hashtag_bool',
       'followers_count', 'friends_count', 'listed_count', 'likes', 'comments',
       'retweets', 'sum_followers_mention', 'url_co

In [40]:
import subprocess
import numpy as np
model = "randomForest"
cmd=["java","-jar","RankLib-2.13.jar","-rank","MyTest.txt","-load",model+".txt","-norm","zscore","-indri",model+"RankedLists.txt"]
output=[]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
for line in proc.stdout.readlines():
    print(line)
    output.append(line[:len(line)-2])
score_data={}
label_data = {}
file1 = open("MyTest.txt","r")
for x in file1:
    curr_row=x.split(" ")
    qid=int(curr_row[1].split(":")[1])
    date=float(curr_row[14].split(":")[1])
    index2=int(curr_row[26].split("_")[0])-1
    if qid not in score_data:
        label_data[qid] = {}
        score_data[qid] = []
    label_data[qid][index2+1] = int(curr_row[0])
    score_data[qid].append([0,date,index2+1])
file1.close()
f = open(model+"RankedLists.txt","r")
f1 = f.readlines()
ans = {}
for x in f1:
#     print(x)
    y = x.split()
    value = y[len(y)-2]
    q_id = y[0]
    z = x.split('_')
    i = z[0]
    i = i.split()
    # print(i)
    i = i[4]
#     print(q_id+" "+i+" "+value)
    for v in range(len(score_data[int(q_id)])):
        # print(score_data[int(q_id)][v][2])
        # print(i)
#         print(str(score_data[int(q_id)][v][2]) + " " + i)
        if score_data[int(q_id)][v][2] == int(i):
            score_data[int(q_id)][v][0] = float(value)
#             print("volla")
            break
for id in score_data.keys():
    score_data[id].sort(key = lambda sub: (-sub[0], sub[1]))
    for x in score_data[id]:
        print(x[2])
    print("change")
# print(score_data)
tot_ndcg = 0
for id in score_data:
    y_score = []
    y_true = []
    for doc in score_data[id]:
        y_score.append(doc[0])
        y_true.append(label_data[id][doc[2]])
#         print(str(doc[0])+" "+str(label_data[id][doc[2]]))
    print(ndcg(y_true,y_score))
    tot_ndcg+=ndcg(y_true,y_score)
tot_ndcg/=len(label_data)
print(tot_ndcg)

b'\r\n'
b'Discard orig. features\r\n'
b'Model file:\trandomForest.txt\r\n'
b'Feature normalization: zscore\r\n'
b'Model:\t\tRandom Forests\r\n'
b'\rReading feature file [MyTest.txt]: 0... \rReading feature file [MyTest.txt]... [Done.]            \r\n'
b'(4 ranked lists, 4240 entries read)\r\n'
39
21
40
13
4
84
24
55
41
12
5
56
7
23
44
72
57
48
42
10
81
92
65
82
14
16
88
20
54
70
45
25
49
38
53
77
100
46
61
26
76
47
90
19
28
3
101
73
66
18
69
8
30
1
35
62
36
86
27
59
11
15
99
91
75
98
104
22
103
95
114
71
68
51
29
43
17
85
31
58
93
2
9
34
74
115
102
78
33
50
60
80
37
67
87
94
83
6
79
96
63
32
89
116
64
52
105
111
113
109
108
106
97
107
123
110
112
117
119
118
124
121
120
122
127
125
131
136
126
133
134
137
130
139
138
128
135
132
143
145
140
129
142
150
147
146
141
149
144
151
148
152
155
154
156
153
157
158
159
160
165
164
162
163
161
170
168
173
174
181
172
166
177
175
169
196
179
187
185
184
186
189
176
171
183
197
180
195
167
178
193
201
188
182
192
200
194
199
190
198
191
215
214
2

1557
1328
1362
1444
1376
1308
1441
1358
1422
1354
1433
1352
1397
1409
1372
1423
1318
1345
1411
1442
1437
1548
1420
1307
1351
1383
1357
1426
1405
1363
1417
1412
1408
1401
1430
1804
1337
1378
1316
1346
1347
1790
1326
1732
1369
1370
1394
1448
1374
1388
1334
1402
1333
1339
1309
1348
1419
1400
1428
1384
1371
1396
1436
1431
1338
1361
1392
1421
1385
1341
1322
1353
1424
1332
1717
1389
1386
1387
1373
1364
1377
1330
1340
1356
1410
1349
1323
1342
1379
1742
1344
1432
1381
1438
1601
1595
1416
1477
1399
1403
1414
1360
1443
1380
1520
1449
1585
1368
1413
1425
1375
1571
1740
1536
1620
1324
1568
1407
1404
1435
1415
1447
1355
1418
1446
1395
1715
1398
1494
1393
1450
1367
1434
1366
1506
1453
1427
1460
1325
1639
1314
1454
1455
1445
1452
1331
1327
1382
1365
1701
1335
1623
1722
1457
1456
1451
1517
1543
1655
1628
1789
1483
1504
1470
1679
1480
1458
1488
1469
1516
1668
1670
1478
1462
1695
1576
1631
1465
1611
1471
1485
1522
1487
1781
1510
1502
1565
1472
1507
1459
1481
1467
1501
1496
1495
1581
1463
1793
1550
1505


In [27]:
file1 = open("MyFile.txt","w")
label_data = {}
score_data = {}
# for query in queries:
#     label_data[query] = {}
for index,trainset in enumerate(trainsets):
    divider=math.ceil(len(trainset)/10)
    for index2,row in enumerate(trainset.values):
        s=""
        s=s+str(min(9,int(row[1]//divider)))
        s=s+" "+"qid:"+str(index+1)
        for j,data in enumerate(row[2:]):
            s=s+" "+str(j+1)+":"+str(data)
            if j+1 == 13:
                date = data
        s=s+" "+"#docid = "+str(index2+1)+"_of_"+row[0]
        s=s+"\n"
        if index+1 not in label_data:
            label_data[index+1] = {}
            score_data[index+1] = []
        label_data[index+1][index2+1] = min(9,int(row[1]//divider))
        score_data[index+1].append([0,date,index2+1])
#         print(s)
        file1.write(s)
#         if index2>5:
#             break
file1.close()


In [28]:
score_data[1]

[[0, 4000571.540051, 1],
 [0, 4003991.541049, 2],
 [0, 4003871.541049, 3],
 [0, 4003871.541049, 4],
 [0, 4003511.541049, 5],
 [0, 3995351.541049, 6],
 [0, 3995531.541049, 7],
 [0, 3991331.541049, 8],
 [0, 3996311.541049, 9],
 [0, 3997571.541049, 10],
 [0, 3995891.541049, 11],
 [0, 3997931.541049, 12],
 [0, 4000631.541049, 13],
 [0, 3991811.541049, 14],
 [0, 3993431.541049, 15],
 [0, 3992411.541049, 16],
 [0, 3992411.541049, 17],
 [0, 3991271.541049, 18],
 [0, 3997031.541049, 19],
 [0, 3996251.541049, 20],
 [0, 3995231.541049, 21],
 [0, 3992291.541049, 22],
 [0, 3993431.541049, 23],
 [0, 3999371.541049, 24],
 [0, 4003511.541049, 25],
 [0, 4002371.541049, 26],
 [0, 4002371.541049, 27],
 [0, 4002371.541049, 28],
 [0, 3999671.541049, 29],
 [0, 3995471.541049, 30],
 [0, 4000871.541049, 31],
 [0, 4004471.542046, 32],
 [0, 3991511.542046, 33],
 [0, 3991691.542046, 34],
 [0, 3991811.542046, 35],
 [0, 3991871.542046, 36],
 [0, 3992051.542046, 37],
 [0, 3991811.542046, 38],
 [0, 3995351.542046, 

In [13]:
print(label_data[1][1])

[0, 9]


In [75]:
a = [1,2,3]
len(a)

3

In [36]:
f = open("lambdaMartRankedLists.txt","r")
f1 = f.readlines()
ans = {}
for x in f1:
    y = x.split()
#     print(x)
    value = y[len(y)-2]
#     print(value)
    q_id = y[0]
    z = x.split('_')
    i = z[0]
    i = i.split()
    i = i[4]
    for v in range(len(score_data[int(q_id)])):
        if score_data[int(q_id)][v][2] == int(i):
            score_data[int(q_id)][v][0] = float(value)
            break
#     if q_id not in ans:
#         ans[q_id] = []
#     ans[q_id].append([float(value),int(i)])



In [67]:
for id in score_data.keys():
    score_data[id].sort(key = lambda sub: (-sub[0], sub[1]))

In [34]:
def ndcg(y_true, y_score, k=1500):
        y_true_sorted = sorted(y_true, reverse=True)
        ideal_dcg = 0
        for i in range(min(k,len(y_score))):
            ideal_dcg += (2 ** y_true_sorted[i] - 1.) / np.log2(i + 2)
        dcg = 0
        argsort_indices = np.argsort(y_score)[::-1]
        for i in range(min(k,len(y_score))):
            dcg += (2 ** y_true[argsort_indices[i]] - 1.) / np.log2(i + 2)
        ndcg = dcg / ideal_dcg
        return ndcg

In [17]:
len(label_data)

5

In [72]:
tot_ndcg = 0
for id in score_data:
    y_score = []
    y_true = []
    for doc in score_data[id]:
        y_score.append(doc[0])
        y_true.append(label_data[id][doc[2]])
    print(ndcg(y_true,y_score))
    tot_ndcg+=ndcg(y_true,y_score)
tot_ndcg/=len(label_data)
print(tot_ndcg)

0.9823436238283647
0.9266135450764893
0.9870200957947841
0.5668820119679532
0.9014029232453312
0.8728524399825845


In [85]:
f = open("lambdaMartRankedLists.txt","r")
f1 = f.readlines()
ans = {}
for x in f1:
    y = x.split()
#     print(x)
    value = y[len(y)-2]
#     print(value)
    q_id = y[0]
    z = x.split('_')
    i = z[0]
    i = i.split()
    i = i[4]
    if q_id not in ans:
        ans[q_id] = []
    ans[q_id].append([float(value),int(i)])



In [86]:
for key in ans.keys():
#     print(key)
#     print(ans[key])
#     ans[key].sort(reverse = True)
    sorted(ans[key], key = lambda sub: (-sub[0], sub[1]))
    #print(ans[key])
    li = []
    for i in ans[key]:
#         print(label_data[int(key)][int(i[1])])
        if int(i[1]) in label_data[int(key)]:
            label = label_data[int(key)][int(i[1])]
            li.append([i[1],label_data[int(key)][int(i[1])]])
        else :
            print(key + " " + i[1])

    print(li) 
    print("change/n")

[[1, 9], [2, 9], [3, 9], [4, 9], [5, 9], [6, 9], [7, 9], [14, 9], [18, 9], [26, 9], [27, 9], [28, 9], [31, 9], [43, 9], [82, 9], [93, 9], [94, 9], [101, 9], [8, 9], [9, 9], [10, 9], [11, 9], [12, 9], [13, 9], [15, 9], [16, 9], [17, 9], [19, 9], [20, 9], [21, 9], [22, 9], [23, 9], [24, 9], [25, 9], [29, 9], [30, 9], [32, 9], [33, 9], [34, 9], [35, 9], [36, 9], [37, 9], [38, 9], [39, 9], [40, 9], [41, 9], [42, 9], [44, 9], [45, 9], [46, 9], [47, 9], [48, 9], [49, 9], [50, 9], [51, 9], [52, 9], [53, 9], [54, 9], [55, 9], [56, 9], [57, 9], [58, 9], [59, 9], [60, 9], [61, 9], [62, 9], [63, 9], [64, 9], [65, 9], [66, 9], [67, 9], [68, 9], [69, 9], [70, 9], [71, 9], [72, 9], [73, 9], [74, 9], [75, 9], [76, 9], [77, 9], [78, 9], [79, 9], [80, 9], [81, 9], [83, 9], [84, 9], [85, 9], [86, 9], [87, 9], [88, 9], [89, 9], [90, 9], [91, 9], [92, 9], [95, 9], [96, 9], [97, 9], [98, 9], [99, 9], [100, 9], [102, 9], [103, 9], [104, 9], [105, 9], [106, 9], [107, 9], [108, 9], [109, 9], [110, 9], [111, 9