# <span style='color:Red'> Social Media News Generation </span>

## <span style='color:Blue'>Project By: </span>
### Nisha Keshav Shenvi
### Ravella.Puthali
### Simna Ashraf
### Sneha Shankar Hirnaik

## <span style='color:Blue'> Mentor:</span>
### Alarsh Tiwari 

###### Note: Create a folder named `Cluster_Output` in the same location as this notebook.
***
***
***
#### <span style='background :yellow' > Importing libraries</span> 


In [1]:
import os
import re
import shutil
import string
import xlsxwriter
import pandas as pd
from pandas import DataFrame
from collections import Counter
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer

#### <span style='background :yellow' > Accessing files</span>

In [2]:
# '__file__' is a string literal here, so dirname() returns '' and abspath()
# resolves to the current working directory.
working_dir_fragment = os.path.dirname('__file__')
path = os.path.abspath(working_dir_fragment)

# The tweet dump is expected next to the notebook; the handle is left open
# because a later cell reads the lines from `f`.
url = os.path.join(path, 'bbchealth.txt')
f = open(url, mode="r", encoding="utf8")



#### <span style='background :yellow' >Declaration of the variables </span>

In [3]:
# Read every line of the input file, then release the handle opened in the
# previous cell (it was never closed before).
tweets = list(f)
f.close()

Headlines_List = []
pointer = 1   # next Excel row to write; row 0 holds the header
num1 = 0
sr_num = 0    # running serial number written to the first Excel column

# Folder that will hold the final result. The path is built with os.path.join
# (instead of a hard-coded '\\') so it is valid on every platform; the trailing
# separator is kept so code that concatenates onto `url1` keeps working.
url1 = os.path.join(path, 'Cluster_Output', '')
# Create the folder if it is missing so the run does not depend on a manual step.
os.makedirs(url1, exist_ok=True)
finalresult = os.path.join(url1, 'cluster_output.xlsx')

# Check if the output file already exists and delete it.
if os.path.exists(finalresult):
    os.remove(finalresult)

#### <span style='background :yellow' >Data Preprocessing </span>

In [4]:
# Number of leading characters dropped from each line (tweet id and timestamp).
prefix_length = 50
# Translation table that deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)
# Patterns matching URLs that must be removed from the text.
url_patterns = (r"http\S+", r"www\S+")


def _clean_tweet(raw_line):
    """Return the normalized headline text for one raw line of the input file."""
    # Drop the trailing newline, then the id/timestamp prefix.
    text = raw_line.strip('\n')[prefix_length:]

    # Remove any URL.
    for pattern in url_patterns:
        text = re.sub(pattern, "", text)

    # Remove hash-tag symbols, lowercase, and strip punctuation.
    text = text.replace('#', '').lower().translate(punctuation_table)

    # Collapse runs of whitespace into single spaces.
    return " ".join(text.split())


# Clean the list in place so later cells keep using the same `tweets` object.
for position, raw_line in enumerate(tweets):
    tweets[position] = _clean_tweet(raw_line)

#### <span style='background :yellow' > Creating a Dataframe to store preprocessed tweets </span>

In [5]:
# Build four independent single-column frames from the cleaned tweets; each
# name gets its own DataFrame object.
df, dataframe_real, output_df, ac_df = (
    DataFrame(tweets, columns=['Headlines']) for _ in range(4)
)

####  <span style='background :yellow' >Feature Extracting using TfidfVectorizer </span>
##### TF-IDF (term frequency–inverse document frequency) is a word-statistics-based method for text feature extraction: each word is weighted by how often it occurs in a document and down-weighted by how many documents contain it. Representing texts this way makes it possible to measure the similarity between them from the words they contain.

In [6]:
# TF-IDF vectorizer that ignores English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
# fit():           learns the vocabulary and the IDF weights from the documents.
# transform():     converts documents into TF-IDF vectors using what was learned.
# fit_transform(): does both steps in a single call.
X = vectorizer.fit_transform(raw_documents=tweets)

#### <span style='background :yellow' > Settings for final result in Excel </span>

In [7]:
# Workbook and the single sheet that receives the final result.
workbook = xlsxwriter.Workbook(finalresult)
worksheet = workbook.add_worksheet()
# Widen the "News" column so headlines are readable.
worksheet.set_column('C:C', 50)

# Header cells: bold white text on a magenta background.
header_properties = {
    'bold': True,
    'font_color': "White",
    'font_size': 14,
    'center_across': True,
    'border': True,
}
bold = workbook.add_format(header_properties)
bold.set_font_color('White')
bold.set_bg_color('Magenta')

# Body cells share the same base properties; the last column additionally wraps text.
body_properties = {
    'bold': False,
    'font_size': 13,
    'reading_order': True,
    'border': True,
    'center_across': True,
}
all_cells = workbook.add_format(dict(body_properties))
last_column = workbook.add_format(dict(body_properties))
last_column.set_text_wrap()

# Header row (row 0): A1, B1 and C1.
for column_index, title in enumerate(('Sr. No.', 'Domain', 'News')):
    worksheet.write(0, column_index, title, bold)

0

In [8]:
# Number of non-null headlines in the frame.
num = df['Headlines'].count()
# Use half as many K-Means clusters as there are headlines (rounded down).
true_k = int(num // 2)

#### <span style='background :yellow' > K-Means Clustering </span>

In [9]:
# K-Means clustering of the TF-IDF vectors.
# random_state pins the centroid initialisation so a re-run of the notebook
# reproduces the same clusters.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)

# Fit once and record the cluster label of every headline.
df["cluster"] = model.fit_predict(X)

# Distance of every headline to every centroid of the model fitted above.
# transform() is used instead of fit_transform() so the model is not refitted:
# a second fit would start from a new initialisation and produce distances
# that no longer correspond to the labels stored in df["cluster"].
cluster_details = model.transform(X)

#### <span style='background :yellow' >Agglomerative Hierarchical Clustering</span>

In [10]:
# Placeholder column; it is filled with the agglomerative labels in the next cell.
df["Agglomerative_Clustering"] = ""
# Use half as many agglomerative clusters as there are rows in cluster_details.
ac_true_k = len(cluster_details) // 2

In [11]:
# Agglomerative (hierarchical) clustering on the K-Means distance features.
# Ward linkage only works with Euclidean distances, which is also the default,
# so the distance is not passed explicitly: the old `affinity` keyword was
# deprecated in scikit-learn 1.2 and removed in 1.4 (replaced by `metric`), and
# omitting it keeps this cell working on both old and new versions.
hc = AgglomerativeClustering(n_clusters=ac_true_k, linkage='ward')
df["Agglomerative_Clustering"] = hc.fit_predict(cluster_details)


### <span style='color:Green'>**Main :**</span>
##### > Agglomerative_Clustering,
##### > Cosine Similarity
##### > Finding Domains of the Cluster
##### > Displaying News in the Excel sheet

In [12]:
def count(fname, words_list):
    """Count how often each word occurs in a collection of texts.

    Parameters:
        fname: list of strings to search (despite the name, not a file name).
        words_list: iterable of words to look for.

    Returns:
        dict mapping each word to its total number of occurrences over all
        texts. Occurrences are counted with str.count, i.e. as substrings.
    """
    # NOTE(review): substring counting means a short term is also counted
    # inside longer words - confirm this is intended.
    count_result = dict()
    for word in words_list:
        occurrences = sum(text.count(word) for text in fname)
        count_result[word] = count_result.get(word, 0) + occurrences
    return count_result


def _keyword_set(headline, stop_words):
    """Tokenize a headline and return the set of its tokens that are not stop words."""
    return {token for token in word_tokenize(headline) if token not in stop_words}


def _cosine_similarity(first, second):
    """Cosine similarity of two keyword sets treated as binary vectors."""
    # Guard against an empty set, which would make the denominator zero.
    if not first or not second:
        return 0.0
    return len(first & second) / float((len(first) * len(second)) ** 0.5)


def _drop_near_duplicates(headlines, stop_words, threshold=0.7):
    """Return the headlines with near-duplicates of an earlier headline removed.

    A headline is dropped when its cosine similarity to any headline kept
    before it is greater than `threshold`. Order is preserved.
    """
    kept_headlines = []
    kept_keywords = []
    for headline in headlines:
        keywords = _keyword_set(headline, stop_words)
        if any(_cosine_similarity(keywords, earlier) > threshold for earlier in kept_keywords):
            continue
        kept_headlines.append(headline)
        kept_keywords.append(keywords)
    return kept_headlines


def _cluster_terms(headlines):
    """Return the TF-IDF vocabulary (English stop words excluded) of the headlines.

    Returns an empty list when the headlines contain no usable term.
    """
    # A fresh vectorizer is used so the notebook-level `vectorizer`, fitted on
    # the full corpus, is not refitted on every cluster.
    cluster_vectorizer = TfidfVectorizer(stop_words='english')
    try:
        cluster_vectorizer.fit(headlines)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty (e.g. the
        # headlines only contain stop words).
        return []
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    # and fall back to the old one on older versions.
    if hasattr(cluster_vectorizer, 'get_feature_names_out'):
        return cluster_vectorizer.get_feature_names_out().tolist()
    return cluster_vectorizer.get_feature_names()


def _most_frequent_term(counts):
    """Return the first term with the highest positive count, or "" if there is none."""
    best_term, best_count = "", 0
    for term, occurrences in counts.items():
        if occurrences > best_count:
            best_term, best_count = term, occurrences
    return best_term


# Build the stop-word set once; set membership is O(1) and the list does not
# change between headline comparisons.
english_stop_words = set(stopwords.words('english'))

for k in range(0, ac_true_k):
    # One output folder per agglomerative cluster, recreated from scratch.
    output_url = os.path.join(url1, 'Cluster ' + str(k))
    if os.path.exists(output_url):
        shutil.rmtree(output_url)
    os.makedirs(output_url)
    file_name = os.path.join(output_url, 'News.txt')

    # Headlines that belong to this cluster, each terminated with ". ".
    ac_df = df.query('Agglomerative_Clustering == ' + str(k))
    Headlines_List = [headline + ". " for headline in ac_df['Headlines'].values.tolist()]

    # Remove headlines that are near-duplicates (cosine similarity > 0.7) of an
    # earlier headline in the same cluster. The previous in-place deletion
    # advanced the index after each `del`, skipping the element that moved
    # into the freed slot, and reused the name `X`, overwriting the TF-IDF matrix.
    # NOTE(review): the ". " suffix is appended before tokenizing, so a "."
    # token is presumably shared by every headline and raises the similarity -
    # confirm whether that is intended.
    Headlines_List = _drop_near_duplicates(Headlines_List, english_stop_words)

    # Write the cluster's news to its text file; the file is opened once and
    # written as UTF-8 to match the encoding the input was read with.
    with open(file_name, "w", encoding="utf8") as text_file:
        for news_number, headline in enumerate(Headlines_List, start=1):
            text_file.write("News " + str(news_number) + ": " + headline + "\n")

    # The cluster's "domain" is the vocabulary term that occurs most often.
    word = _most_frequent_term(count(Headlines_List, _cluster_terms(Headlines_List)))

    dataframe_real = pd.DataFrame(Headlines_List, columns=['Headlines'])
    dataframe_real['cluster'] = word

    # Write this cluster's rows into the result Excel sheet.
    for headline in Headlines_List:
        sr_num += 1
        worksheet.write(pointer, 0, sr_num, all_cells)
        worksheet.write(pointer, 1, word, all_cells)
        worksheet.write(pointer, 2, headline, last_column)
        pointer += 1


workbook.close()

In [13]:
# Final status message for the notebook.
print("Task Completed!")

Task Completed!


>>>>>>>> #  <span style='color:Yellow'><span style='background :Blue'> ***Thank You*** </span></span> 