In [53]:
import xml.etree.ElementTree as ET
import pandas as pd
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import  re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity

In [14]:

tree = ET.parse('enwiki-20210101-pages-articles-multistream12.xml-p8554860p9172788')
root = tree.getroot()

titles = []
texts = []
ids = []

ns = {'mediawiki': 'http://www.mediawiki.org/xml/export-0.10/'}
for child in root.findall('mediawiki:page', ns):
    title = child.find('mediawiki:title', ns)
    identifier = child.find('mediawiki:id', ns)
    titles.append(title.text)
    ids.append(identifier.text)
    for revision in child.findall('mediawiki:revision', ns):
        text_data = revision.find('mediawiki:text', ns)
        if text_data != None:
            texts.append(text_data.text)
        else:
            texts.append(None)

            # Create data frame with elements

dataframe = pd.DataFrame(data={'Title': titles, 'ID': ids, 'Text': texts})
dataframe.head(5)

Unnamed: 0,Title,ID,Text
0,Chestnut Ridge Middle School,8554860,#REDIRECT[[Washington Township Public School D...
1,Colegio de Santa Cruz de Tlatelolco,8554864,{{Infobox university\n|name = Col...
2,Template:US-gov-bio-stub,8554865,{{asbox\n| image = Great Seal of the Unite...
3,Impractical joker (garfield),8554867,#REDIRECT [[List of Garfield and Friends episo...
4,File:The Imperial Dowager Empress Yehenara.PNG,8554869,== Summary ==\nhttp://guangxu.netor.com/galler...


In [15]:
print(titles)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Preprocessing the Data

<ol>    
    <li> Remove tags like</li>

    <li>Remove Urls</li>
    
    <li>Remove Punctuations</li>
    
    <li>Remove stop words</li>
    
    <li>Remove numbers</li>
</ol>

In [16]:
#Remove html tags 
def tags_Rem(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext
#Remove urls
def url_Rem(text):
    text = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', text, flags=re.MULTILINE)
    return text
#Remove punctuations 
#THis function takes a string as input, removes the punctuation and return it back 
def Rem_Punc(text):
        text = re.sub(r'[^\w\s]','',text)
        return text
##Remove \n
def newline_remove(text):
    text.replace('\n', '')
    return text##remove stop words 

def stop_words(text):
    

    stop_words = set(stopwords.words('english')) 

    word_tokens = word_tokenize(text) 

    filtered_text = [w for w in word_tokens if not w in stop_words] 

    filtered_text = ""

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_text = filtered_text + " "+w
    return filtered_text
 



In [17]:
def pre_process(text):
    text = tags_Rem(text)
    text = url_Rem(text)
    text = Rem_Punc(text)
    text = newline_remove(text)
    text = stop_words(text)
    return text""
    
    

In [18]:
### drop  empty  pages 
drop_lines = 'Portal|File|Category|JPG|PNG|jpg|Wikipedia|Template'
dataframe = dataframe[~dataframe.Title.str.contains(drop_lines)]
dataframe = dataframe.dropna().reset_index()
del dataframe['index']"""

In [19]:
##Save the preprocessed dataset
titles_ = []
ids_ = []
text_ = []
for index, row in dataframe.iterrows():
    #print(type(row["Text"]))
    text =pre_process( str(row["Text"]))
    text_.append(text)
    ids_.append(row["ID"])
    titles_.append(row["Title"])"""

In [20]:
##Read the dataset
dataframe_processed = pd.DataFrame(data={'Title': titles_, 'ID': ids_, 'Text': text_})
dataframe_processed.to_csv("processed_data", sep='\t')


### TD_IDF Representation 

In [4]:
tf=TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')


In [14]:
#read the data
dataframe_processed = pd.read_csv("processed_data",sep = "\t") 
##reduce tha data


In [42]:
#reduce the dataset 
#dataframe_processed_reduced = dataframe_processed.iloc[:1000]

In [43]:
#drop the extra column
dataframe_processed.pop("Unnamed: 0")

0        0
1        1
2        2
3        3
4        4
      ... 
995    995
996    996
997    997
998    998
999    999
Name: Unnamed: 0, Length: 1000, dtype: int64

In [45]:
#comput the tfidf matrix
tfidf_matrix = tf.fit_transform(dataframe_processed['Text'])


In [46]:
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [47]:
cosine_similarities

array([[1.        , 0.01173054, 0.        , ..., 0.        , 0.        ,
        0.00959829],
       [0.01173054, 1.        , 0.00120294, ..., 0.        , 0.        ,
        0.00857503],
       [0.        , 0.00120294, 1.        , ..., 0.00778287, 0.0114735 ,
        0.0123577 ],
       ...,
       [0.        , 0.        , 0.00778287, ..., 1.        , 0.01734471,
        0.        ],
       [0.        , 0.        , 0.0114735 , ..., 0.01734471, 1.        ,
        0.        ],
       [0.00959829, 0.00857503, 0.0123577 , ..., 0.        , 0.        ,
        1.        ]])

In [48]:
results = {}


In [49]:
for idx, row in dataframe_processed.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], dataframe_processed['Title'][i]) for i in similar_indices]

    # First item is the item itself, so remove it.
    # Each dictionary entry is like: [(1,2), (3,4)], with each tuple being (score, item_id)
    results[row['Title']] = similar_items[1:]

In [50]:
def item(id):
    return dataframe_processed_reduced.loc[dataframe_processed['Title'] == id]['Title'].tolist()[0].split(' - ')[0]

In [51]:
def recommend(item_id, num):
    print("Recommending " + str(num) + " links similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + 'https://en.wikipedia.org/wiki/'+item(rec[1]).replace(" ", "_") + " (score:" + str(rec[0]) + ")")
        print("========================================================")

In [None]:
link = input ("Enter a wiki link: ") 
print(link[30:])
print(link)
string= str(link[30:]).replace("_", " ")
#string = str(link)
string= "".join([x for x in string]).split(" ")
string = " ".join(string)
string = string.lstrip()
print(string)

In [52]:
recommend(item_id=string, num=10)

Recommending 5 products similar to Colegio de Santa Cruz de Tlatelolco...
-------
Recommended: Albuquerque Trivia (score:0.0666828982847375)
Recommended: Ewing Young (score:0.054236226289949695)
Recommended: List of points of interest in Albuquerque, New Mexico (score:0.05273692419193209)
Recommended: Sisters' college (score:0.040509451026999745)
Recommended: Richard John Garcia (score:0.03711085198111095)
