In [90]:
import txtclean as tc
import clean_text_regex
import cosine
import euclidean_distance
import jaccard
import levenshtein
import wordoverlapratio
paragraph = '''Data structure where data elements are arranged sequentially or linearly where each and every element is attached to its previous and next adjacent is called a linear data structure. In linear data structure, single level is involved. Therefore, we can traverse all the elements in single run only. Linear data structures are easy to implement because computer memory is arranged in a linear way. Its examples are array, stack, queue, linked list, etc. 
1. Array

The array is a type of data structure that stores elements of the same type. These are the most basic and fundamental data structures. Data stored in each position of an array is given a positive value called the index of the element. The index helps in identifying the location of the elements in an array.

If supposedly we have to store some data i.e. the price of ten cars, then we can create a structure of an array and store all the integers together. This doesn’t need creating ten separate integer variables. Therefore, the lines in a code are reduced and memory is saved. The index value starts with 0 for the first element in the case of an array.'''

In [91]:
#Step 1: text cleaning -> punctuation removal,special characters and numbers removal
content = tc.clean_text(paragraph)
print("CONTENT AFTER STEP 1 PUNCTUATION REMOVAL: ",content)

CONTENT AFTER STEP 1 PUNCTUATION REMOVAL:  data structure where data elements are arranged sequentially or linearly where each and every element is attached to its previous and next adjacent is called a linear data structure in linear data structure single level is involved therefore we can traverse all the elements in single run only linear data structures are easy to implement because computer memory is arranged in a linear way its examples are array stack queue linked list etc  array the array is a type of data structure that stores elements of the same type these are the most basic and fundamental data structures data stored in each position of an array is given a positive value called the index of the element the index helps in identifying the location of the elements in an array if supposedly we have to store some data i e the price of ten cars then we can create a structure of an array and store all the integers together this doesn t need creating ten separate integer variables 

In [92]:
#Step 2: tokenization -> split text to words or tokens
word = tc.tokenize_text(content)
print("CONTENT AFTER STEP 2 TOKENIZE: ",word)

CONTENT AFTER STEP 2 TOKENIZE:  ['data', 'structure', 'where', 'data', 'elements', 'are', 'arranged', 'sequentially', 'or', 'linearly', 'where', 'each', 'and', 'every', 'element', 'is', 'attached', 'to', 'its', 'previous', 'and', 'next', 'adjacent', 'is', 'called', 'a', 'linear', 'data', 'structure', 'in', 'linear', 'data', 'structure', 'single', 'level', 'is', 'involved', 'therefore', 'we', 'can', 'traverse', 'all', 'the', 'elements', 'in', 'single', 'run', 'only', 'linear', 'data', 'structures', 'are', 'easy', 'to', 'implement', 'because', 'computer', 'memory', 'is', 'arranged', 'in', 'a', 'linear', 'way', 'its', 'examples', 'are', 'array', 'stack', 'queue', 'linked', 'list', 'etc', 'array', 'the', 'array', 'is', 'a', 'type', 'of', 'data', 'structure', 'that', 'stores', 'elements', 'of', 'the', 'same', 'type', 'these', 'are', 'the', 'most', 'basic', 'and', 'fundamental', 'data', 'structures', 'data', 'stored', 'in', 'each', 'position', 'of', 'an', 'array', 'is', 'given', 'a', 'positi

In [93]:
#Step 3: stop word removal 
word = tc.stopwordremove(word)
# word = ' '.join()
print("CONTENT AFTER STEP 3 STOPWORD REMOVAL: ",word)

CONTENT AFTER STEP 3 STOPWORD REMOVAL:  ['data', 'structure', 'data', 'elements', 'arranged', 'sequentially', 'linearly', 'every', 'element', 'attached', 'previous', 'next', 'adjacent', 'called', 'linear', 'data', 'structure', 'linear', 'data', 'structure', 'single', 'level', 'involved', 'therefore', 'traverse', 'elements', 'single', 'run', 'linear', 'data', 'structures', 'easy', 'implement', 'computer', 'memory', 'arranged', 'linear', 'way', 'examples', 'array', 'stack', 'queue', 'linked', 'list', 'etc', 'array', 'array', 'type', 'data', 'structure', 'stores', 'elements', 'type', 'basic', 'fundamental', 'data', 'structures', 'data', 'stored', 'position', 'array', 'given', 'positive', 'value', 'called', 'index', 'element', 'index', 'helps', 'identifying', 'location', 'elements', 'array', 'supposedly', 'store', 'data', 'e', 'price', 'ten', 'cars', 'create', 'structure', 'array', 'store', 'integers', 'together', 'need', 'creating', 'ten', 'separate', 'integer', 'variables', 'therefore', 

In [94]:
# #Step 4: Stemming (convert to base using dictionary) -> convert words to root words
# from nltk.stem import PorterStemmer
# k = []
# ps = PorterStemmer()
# for i in word:
#     # temp = ps.stem(i)
#     # if temp not in k:
#     k.append(ps.stem(i))
#     # print("CONTENT AFTER STEP 4 STEMMING: ",k)
# print("CONTENT AFTER STEP 4 STEMMING: ",k)

In [95]:
#Step 5: Stemming and Lemmatization (convert to base using dictionary) -> convert words to root words
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
lem_word=[]
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
for i in word:
    #if word ends with past tense or ing or ed then perform stemming else lemmatization
    if i.endswith('ed') or i.endswith('ing'):
        lem_word.append(ps.stem(i))
    else:
        lem_word.append(lemmatizer.lemmatize(i))
print("CONTENT AFTER STEP 5 LEMMATIZATION: ",lem_word)

CONTENT AFTER STEP 5 LEMMATIZATION:  ['data', 'structure', 'data', 'element', 'arrang', 'sequentially', 'linearly', 'every', 'element', 'attach', 'previous', 'next', 'adjacent', 'call', 'linear', 'data', 'structure', 'linear', 'data', 'structure', 'single', 'level', 'involv', 'therefore', 'traverse', 'element', 'single', 'run', 'linear', 'data', 'structure', 'easy', 'implement', 'computer', 'memory', 'arrang', 'linear', 'way', 'example', 'array', 'stack', 'queue', 'link', 'list', 'etc', 'array', 'array', 'type', 'data', 'structure', 'store', 'element', 'type', 'basic', 'fundamental', 'data', 'structure', 'data', 'store', 'position', 'array', 'given', 'positive', 'value', 'call', 'index', 'element', 'index', 'help', 'identifi', 'location', 'element', 'array', 'supposedly', 'store', 'data', 'e', 'price', 'ten', 'car', 'create', 'structure', 'array', 'store', 'integer', 'together', 'need', 'creat', 'ten', 'separate', 'integer', 'variable', 'therefore', 'line', 'code', 'reduc', 'memory', '

In [96]:
#CHECK THE SENTECE FORMED AFTER STEMMING AND LEMMATIZATION
word = ' '.join(lem_word)
print("CONTENT AFTER PREPROCESSING: ",word)

CONTENT AFTER PREPROCESSING:  data structure data element arrang sequentially linearly every element attach previous next adjacent call linear data structure linear data structure single level involv therefore traverse element single run linear data structure easy implement computer memory arrang linear way example array stack queue link list etc array array type data structure store element type basic fundamental data structure data store position array given positive value call index element index help identifi location element array supposedly store data e price ten car create structure array store integer together need creat ten separate integer variable therefore line code reduc memory save index value start first element case array


In [97]:
#Step 6: Named Entity Recognition
import spacy 
nlpl = spacy.load('en_core_web_sm')
doc = nlpl(word)
for ent in doc.ents:
    print(ent.text,ent.label_)


arrang GPE
linear ORG
linear ORG
linear ORG
arrang linear FAC
ten CARDINAL
ten CARDINAL
first ORDINAL


In [98]:
# Normal key word extraction
keyword1 = tc.keywordextract(word)
print("CONTENT AFTER STEP 7 NORMAL KEYWORD EXTRACTION: ",keyword1)
len(keyword1)

CONTENT AFTER STEP 7 NORMAL KEYWORD EXTRACTION:  [('create', 1), ('location', 1), ('data', 1), ('computer', 1), ('index', 1), ('array', 1), ('implement', 1), ('queue', 1), ('store', 1), ('value', 1), ('car', 1), ('run', 1), ('case', 1), ('adjacent', 1), ('way', 1)]


15

In [99]:
#keyword extraction using monekey learn
keyword2 = tc.monkeyword(word)
print("CONTENT AFTER STEP 7 MONKEYLEARN KEYWORD EXTRACTION: ",keyword2.keys())
len(keyword2)

linear data structure: 0.986842105263158
arrang linear way example array stack queue link list: 0.39473684210526316
array array type data structure store element type: 0.39473684210526316
basic fundamental data structure data store position array: 0.39473684210526316
data structure data element arrang: 0.39473684210526316
first element case array: 0.39473684210526316
identifi location element array: 0.39473684210526316
line code reduc memory: 0.39473684210526316
positive value call index element index: 0.39473684210526316
previous next adjacent call: 0.39473684210526316
CONTENT AFTER STEP 7 MONKEYLEARN KEYWORD EXTRACTION:  dict_keys(['linear data structure', 'arrang linear way example array stack queue link list', 'array array type data structure store element type', 'basic fundamental data structure data store position array', 'data structure data element arrang', 'first element case array', 'identifi location element array', 'line code reduc memory', 'positive value call index elemen

10

In [100]:
#keyword extraction using YAKE
import yake
key_extractor = yake.KeywordExtractor()
language = "en"
max_ngram_size = 3
deduplication_threshold = 0.5
numOfKeywords = 14
custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
keyword3 = custom_kw_extractor.extract_keywords(word)
print("CONTENT AFTER STEP 7 YAKE KEYWORD EXTRACTION: ")
keyword3 = sorted(keyword3, key = lambda x: x[1],reverse=True)
len(keyword3)

CONTENT AFTER STEP 7 YAKE KEYWORD EXTRACTION: 


14

In [101]:
#LINKS ARRAY
import numpy as np
links=np.array([])

In [102]:
#SEARCH THROUGH KEYWORD 1
import search
c = []
for i in range(len(keyword1)):
    c.append(keyword1[i][0])

#Join the Keywords2
c = ' '.join(c)
#Search using the google API
k = search.search(c)
for i in k:
    links = np.append(links,i['link'])
    # print(i['link'])
links

array(['https://www.tutorialspoint.com/data_structures_algorithms/dsa_quick_guide.htm',
       'https://www.geeksforgeeks.org/linked-list-vs-array/',
       'https://www.techtarget.com/searchdatamanagement/definition/data-structure',
       'https://www.geeksforgeeks.org/check-possible-path-2d-matrix/',
       'https://www.freecodecamp.org/news/data-structures-101-arrays-a-visual-introduction-for-beginners-7f013bcc355a/',
       'https://introcs.cs.princeton.edu/14array',
       'https://www.softwaretestinghelp.com/linked-list-in-java/',
       'https://css-tricks.com/lots-of-ways-to-use-math-random-in-javascript/',
       'https://www.softwaretestinghelp.com/linked-list/',
       'https://dl.acm.org/doi/pdf/10.1145/273133.273160'], dtype='<U110')

In [103]:
#SEARCH THROUGH KEYWORD 2
#Join the Keywords
c = keyword2
c = ' '.join(c)
#Search using the google API
k = search.search(c)
for i in k:
    if i['link'] in links:
        continue
    else:
        links = np.append(links,i['link'])
    print(i['link'])
# links[1] == links[10]

https://www.geeksforgeeks.org/difference-between-linear-and-non-linear-data-structures/
https://www.freecodecamp.org/news/the-top-data-structures-you-should-know-for-your-next-coding-interview-36af0831f5e3/
https://www.simplilearn.com/tutorials/data-structure-tutorial/what-is-data-structure
https://towardsdatascience.com/8-common-data-structures-every-programmer-must-know-171acf6a1a42
https://www.programiz.com/dsa/data-structure-types
https://dev.to/gbengelebs/introduction-to-data-structures-ok3
https://en.wikipedia.org/wiki/Stack_(abstract_data_type)
https://www.interviewbit.com/data-structure-interview-questions/
https://afteracademy.com/blog/introduction-to-data-structure


In [104]:
#SEARCH THROUGH KEYWORD 3
c = []
for i in range(len(keyword1)):
    c.append(keyword1[i][0])
#Join the Keywords
c = ' '.join(c)
k = search.search(c)
for i in k:
    if i['link'] not in links:
        links = np.append(links,i['link'])
    print(i['link'])

https://www.tutorialspoint.com/data_structures_algorithms/dsa_quick_guide.htm
https://www.geeksforgeeks.org/linked-list-vs-array/
https://www.techtarget.com/searchdatamanagement/definition/data-structure
https://www.geeksforgeeks.org/check-possible-path-2d-matrix/
https://www.freecodecamp.org/news/data-structures-101-arrays-a-visual-introduction-for-beginners-7f013bcc355a/
https://introcs.cs.princeton.edu/14array
https://www.softwaretestinghelp.com/linked-list-in-java/
https://css-tricks.com/lots-of-ways-to-use-math-random-in-javascript/
https://www.softwaretestinghelp.com/linked-list/
https://dl.acm.org/doi/pdf/10.1145/273133.273160


In [105]:
#TOTAL NUMBER OF UNIQUE LINKS
len(links)

19

In [106]:
# Find the links which are scrapable from the links array usin beautiful soup
import requests
from bs4 import BeautifulSoup
import PyPDF2

scrapable_links = []

for link in links:
    try:
        response = requests.get(link)
        if response.status_code == 200:
            content_type = response.headers.get('content-type')
            if content_type.startswith('text/html'):
                soup = BeautifulSoup(response.content, 'html.parser')
                if soup is not None:
                    scrapable_links.append(link)
            elif content_type == 'application/pdf':
                pdf_file = PyPDF2.PdfFileReader(response.content)
                if pdf_file.getNumPages() > 0:
                    scrapable_links.append(link)
    except:
        pass

print(scrapable_links)


['https://www.tutorialspoint.com/data_structures_algorithms/dsa_quick_guide.htm', 'https://www.geeksforgeeks.org/linked-list-vs-array/', 'https://www.techtarget.com/searchdatamanagement/definition/data-structure', 'https://www.geeksforgeeks.org/check-possible-path-2d-matrix/', 'https://www.freecodecamp.org/news/data-structures-101-arrays-a-visual-introduction-for-beginners-7f013bcc355a/', 'https://introcs.cs.princeton.edu/14array', 'https://css-tricks.com/lots-of-ways-to-use-math-random-in-javascript/', 'https://www.geeksforgeeks.org/difference-between-linear-and-non-linear-data-structures/', 'https://www.freecodecamp.org/news/the-top-data-structures-you-should-know-for-your-next-coding-interview-36af0831f5e3/', 'https://www.simplilearn.com/tutorials/data-structure-tutorial/what-is-data-structure', 'https://towardsdatascience.com/8-common-data-structures-every-programmer-must-know-171acf6a1a42', 'https://www.programiz.com/dsa/data-structure-types', 'https://dev.to/gbengelebs/introducti

In [107]:
print("TOTAL NUMBER OF SCRAPABLE LINKS: ",len(scrapable_links))

TOTAL NUMBER OF SCRAPABLE LINKS:  15


In [108]:
#Paragraph Extraction from the scrapable links
suspicous_paragraphs = []

In [109]:
import requests
from bs4 import BeautifulSoup
import PyPDF2

links = scrapable_links

for link in links:
    try:
        response = requests.get(link)
        content_type = response.headers.get('content-type')
        if content_type.startswith('text/html'):
            soup = BeautifulSoup(response.content, 'html.parser')
            # Extract the page title from the HTML page
            title = soup.title.string if soup.title else ''
            print(f"Scraping HTML content from {link} - Page title: {title}")
            # Extract the text from the HTML page
            text = soup.get_text()
            suspicous_paragraphs.append(text)
        elif content_type == 'application/pdf':
            pdf_file = PyPDF2.PdfFileReader(response.content)
            if pdf_file.getNumPages() > 0:
                # Extract the text from the PDF file
                text = ''
                for i in range(pdf_file.getNumPages()):
                    page = pdf_file.getPage(i)
                    text += page.extractText()
                suspicous_paragraphs.append(text)
                print(f"Scraping PDF content from {link} - Text: {text}")
        else:
            print(f"Content type not supported for {link}")
    except:
        print(f"Error fetching content from {link}")


Scraping HTML content from https://www.tutorialspoint.com/data_structures_algorithms/dsa_quick_guide.htm - Page title: Data Structures & Algorithms - Quick Guide
Scraping HTML content from https://www.geeksforgeeks.org/linked-list-vs-array/ - Page title: Linked List vs Array - GeeksforGeeks
Scraping HTML content from https://www.techtarget.com/searchdatamanagement/definition/data-structure - Page title: What are Data Structures? - Definition from WhatIs.com
Scraping HTML content from https://www.geeksforgeeks.org/check-possible-path-2d-matrix/ - Page title: Check for possible path in 2D matrix - GeeksforGeeks
Scraping HTML content from https://www.freecodecamp.org/news/data-structures-101-arrays-a-visual-introduction-for-beginners-7f013bcc355a/ - Page title: Data Structures 101: Arrays — A Visual Introduction for Beginners
Scraping HTML content from https://introcs.cs.princeton.edu/14array - Page title: 
Arrays
Scraping HTML content from https://css-tricks.com/lots-of-ways-to-use-math-

In [110]:
print("TOTAL NUMBER OF SUSPICIOUS PARAGRAPHS: ",len(suspicous_paragraphs))

TOTAL NUMBER OF SUSPICIOUS PARAGRAPHS:  15


In [111]:
#2D array of suspicious paragraphs
row = []
temp = []
preprocessed_suspicious_paragraph=[]

In [112]:
#Text Preprocessing
for i in suspicous_paragraphs:
    #feauture extraction
    #stop word count
    temp1 = clean_text_regex.stopwordcount(paragraph)
    if temp1==0:
        temp.append(0)
    else:
        temp.append(clean_text_regex.stopwordcount(i)/temp1)
   
    #punctuation count
    temp2 = clean_text_regex.punctuation_count(paragraph)
    if temp2==0:
        temp.append(0)
    else:
        temp.append(clean_text_regex.punctuation_count(i)/temp2)
    
    #Step 1: text cleaning -> punctuation removal,special characters and numbers removal
    sus_para = tc.clean_text(i)

    #Step 2: tokenization -> split text to words or tokens
    sus_token = tc.tokenize_text(sus_para)
    print("CONTENT AFTER STEP 2 TOKENIZE: ",sus_token)

    #Step 3: stop word removal 
    sus_token = tc.stopwordremove(sus_token)

    #Step 5: Stemming and Lemmatization (convert to base using dictionary) -> convert words to root words
    lem_word=[]
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer()
    for i in sus_token:
        
        #if sus_token ends with past tense or ing or ed then perform stemming else lemmatization
        if i.endswith('ed') or i.endswith('ing'):
            lem_word.append(ps.stem(i))
        else:
            lem_word.append(lemmatizer.lemmatize(i))

    sus_token = ' '.join(lem_word)
    preprocessed_suspicious_paragraph.append(sus_token)
    # row[i].append(temp)
    row.append(temp)
    temp = []

CONTENT AFTER STEP 2 TOKENIZE:  ['data', 'structures', 'algorithms', 'quick', 'guide', 'home', 'coding', 'ground', 'jobs', 'whiteboard', 'tools', 'corporate', 'training', 'teach', 'with', 'us', 'login', 'category', 'academic', 'tutorials', 'big', 'data', 'analytics', 'computer', 'programming', 'computer', 'science', 'databases', 'devops', 'digital', 'marketing', 'engineering', 'tutorials', 'exams', 'syllabus', 'famous', 'monuments', 'gate', 'exams', 'latest', 'technologies', 'machine', 'learning', 'mainframe', 'development', 'management', 'tutorials', 'mathematics', 'tutorials', 'microsoft', 'technologies', 'misc', 'tutorials', 'mobile', 'development', 'java', 'technologies', 'python', 'technologies', 'sap', 'tutorials', 'programming', 'scripts', 'selected', 'reading', 'software', 'quality', 'soft', 'skills', 'telecom', 'tutorials', 'upsc', 'ias', 'exams', 'web', 'development', 'sports', 'tutorials', 'xml', 'technologies', 'multi', 'language', 'interview', 'questions', 'prime', 'packs'

In [113]:
row

[[86.04597701149426, 221.76],
 [11.0, 46.08],
 [11.172413793103448, 17.04],
 [9.931034482758621, 130.04],
 [9.045977011494253, 14.84],
 [22.50574712643678, 99.84],
 [4.747126436781609, 12.76],
 [7.574712643678161, 17.16],
 [8.333333333333334, 12.8],
 [25.95402298850575, 38.96],
 [10.885057471264368, 16.6],
 [4.64367816091954, 8.28],
 [11.195402298850574, 17.72],
 [19.95402298850575, 62.84],
 [5.574712643678161, 11.64]]

In [114]:
j=0
for i in preprocessed_suspicious_paragraph:
    #feauture extraction
    
    #text after punctuation removal, stop word removal, numbers removal and extra whitespace removal
    original = clean_text_regex.clean_text(i)

    #word overlap ratio
    row[j].append(wordoverlapratio.word_overlap_ratio(original,word))

    #cosine similarity
    row[j].append(cosine.cosine_similarity(original,word))

    #jaccard similarity
    row[j].append(jaccard.jaccard_similarity(original,word))

    #euclidean distance
    row[j].append(euclidean_distance.euclidean(original,word))

    #Leshtein Distance
    row[j].append(levenshtein.levenshtein_distance(original,word))
    j+=1


0.004371407755201165
0.01206140350877193
0.025377643504531724
0.010720887245841035
0.03252885624344176
0.014098045498237744
0.02699228791773779
0.03271767810026385
0.031144781144781145
0.014121338912133892
0.027679205110007096
0.030217186024551465
0.02574146614437605
0.013084112149532711
0.038802660753880266


In [115]:
row[0]

[86.04597701149426,
 221.76,
 0.004371407755201165,
 0.45507277795756695,
 0.039045553145336226,
 879.5271456868173,
 79240]

In [116]:
#load the model finalized_model.sav
import pickle
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
#use the model to predict the output
for i in row:
    inp = [i]
    result = loaded_model.predict(inp)
    print(result)

[75.38634797]
[13.32595039]
[6.64892226]
[35.81762783]
[6.01715871]
[31.1909972]
[4.36825377]
[5.12602112]
[5.04412465]
[15.03579572]
[6.65255134]
[2.93920559]
[6.74499839]
[20.26397951]
[4.20669859]




In [117]:
# 1 and 15 and 4th link is suspicious
# scrapable_links[0]
# scrapable_links[14]
scrapable_links[6]

'https://css-tricks.com/lots-of-ways-to-use-math-random-in-javascript/'