In [1]:
import pandas as pd
import re
from random import choice

In [2]:
def loadAndProcessData(filename):
    tweet_df = pd.read_csv(filename, sep="|", header=None, names=["ID", "TIMESTAMP", "TWEET"],encoding= 'unicode_escape')
    tweet_df.drop(['TIMESTAMP'], axis = 1, inplace = True)
    # Removing the links from the tweets 
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet : re.sub(r'https?://[^ ]+', '', tweet) )
    # Removing the username from the tweets 
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet : re.sub(r'@[^ ]+', '', tweet) )
    # Removing the # from the tweets 
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet : re.sub(r'#', '', tweet) )
    # Removing the special charaters and number
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet : re.sub(r'[^A-Za-z ]', '', tweet) )
    # Convert All the text to lowercase 
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet :  tweet.lower() )
    # Normalize the data 
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet : re.sub(r'([A-Za-z])\1{2,}', r'\1', tweet) )
    # remove rt which stands for re tweet 
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet : re.sub(r'^rt ', '', tweet) )
    #converting the tweet to set of words
    tweet_df['TWEET']=tweet_df['TWEET'].apply(lambda tweet: set(str(tweet).split()))
    return tweet_df

In [3]:
def getScore(oneTweet,twoTweet):
    inter = list(oneTweet & twoTweet)
    I = len(inter)
    union = list(oneTweet | twoTweet)
    U = len(union)
    return round(1 - (float(I) / U), 4)

In [4]:
def getIntialCenteriods(tweet_dic,k):
    tweet_score=[]
    for key in tweet_dic:
        currentScore=0
        for ele in tweet_dic:
            if key!=ele:
                currentScore+=getScore(tweet_dic[key],tweet_dic[ele])
        tweet_score.append([currentScore,key])
    tweet_score=sorted(tweet_score, key=lambda x: x[0])
    centeriods={}
    for i in range(int(len(tweet_score)/2-int(k/2)),int(len(tweet_score)/2)+int(k/2)+1):
        centeriods[tweet_score[i][1]]=[]
    return centeriods
    

In [5]:
def assignCluster(tweet_dic,centertiods):
    for tweet in tweet_dic:
        min_Tweet=[1.1,None]
        for center in centertiods:
            current=getScore(tweet_dic[tweet],tweet_dic[center])
            if current<min_Tweet[0]:
                min_Tweet=[current,center]
        centertiods[min_Tweet[1]].append(tweet)
    return centertiods

In [6]:
def oneCenter(listOfTweet):
    tweet_score=[]
    len_Tweet=len(listOfTweet)
    for key in listOfTweet:
        currentScore=0
        for ele in listOfTweet:
            if key!=ele:
                currentScore+=getScore(tweet_dic[key],tweet_dic[ele])
        tweet_score.append([currentScore,key])
    tweet_score=sorted(tweet_score, key=lambda x: x[0])
    return tweet_score[0][1]

In [7]:
def SSE(centeriods):
    distance=0
    for center in centeriods:
        for tweet in centeriods[center]:
            distance+=getScore(tweet_dic[center],tweet_dic[tweet])
    return round(distance,2)

In [8]:
#main
tweet=loadAndProcessData('usnewshealth.txt')
tweet_dic=dict(zip(tweet['ID'], tweet['TWEET']))
centeriods={}
k_Value=[5,10,15,20]
for k in k_Value:
    print("Value of K: {}".format(k))
    count=0
    while count<k:
        key=choice(list(tweet_dic.keys()))
        if key not in centeriods:
            centeriods[key]=[]
            count+=1
    #print(centeriods)
    #centeriods=getIntialCenteriods(tweet_dic,10)
    preCenteriods=set()
    currentCenteriods=set(centeriods.keys())
    centeriods=assignCluster(tweet_dic,centeriods)
    count=1
    while not(preCenteriods == currentCenteriods):
        #print(count)
        count+=1
        preCenteriods=currentCenteriods.copy()
        newCenteriods={}
        for center in centeriods:
            newCenter=oneCenter(centeriods[center])
            newCenteriods[newCenter]=[]
        newCenteriods=assignCluster(tweet_dic,newCenteriods)
        centeriods=newCenteriods.copy()
        currentCenteriods=set(centeriods.keys())
    count=0
    #print(centeriods)
    sse_value=SSE(centeriods)

    print("SSE value: {}".format(sse_value))
    for center in centeriods:
        count+=1
        print(" Size of cluster {} : {}".format(count,len(centeriods[center])))

Value of K: 5
SSE value: 1242.28
 Size of cluster 1 : 272
 Size of cluster 2 : 209
 Size of cluster 3 : 154
 Size of cluster 4 : 580
 Size of cluster 5 : 180
Value of K: 10
SSE value: 1192.26
 Size of cluster 1 : 153
 Size of cluster 2 : 114
 Size of cluster 3 : 104
 Size of cluster 4 : 277
 Size of cluster 5 : 99
 Size of cluster 6 : 45
 Size of cluster 7 : 159
 Size of cluster 8 : 65
 Size of cluster 9 : 97
 Size of cluster 10 : 31
 Size of cluster 11 : 6
 Size of cluster 12 : 32
 Size of cluster 13 : 77
 Size of cluster 14 : 63
 Size of cluster 15 : 73
Value of K: 15
SSE value: 1156.07
 Size of cluster 1 : 86
 Size of cluster 2 : 75
 Size of cluster 3 : 72
 Size of cluster 4 : 148
 Size of cluster 5 : 69
 Size of cluster 6 : 35
 Size of cluster 7 : 98
 Size of cluster 8 : 53
 Size of cluster 9 : 67
 Size of cluster 10 : 26
 Size of cluster 11 : 3
 Size of cluster 12 : 26
 Size of cluster 13 : 50
 Size of cluster 14 : 46
 Size of cluster 15 : 48
 Size of cluster 16 : 30
 Size of clus