In [1]:
import matplotlib.pyplot as plt
import nltk
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from urllib.request import urlopen
from bs4 import BeautifulSoup

### Previously Defined functions

In [2]:
def ask_user():
    """Interactively ask for the URL to analyse and the number of features.

    Prompts are printed to stdout and answers are read with ``input()``.
    The yes/no answers are compared case-insensitively and with surrounding
    whitespace ignored; any answer other than "yes" selects the default.

    Returns:
        tuple: ``(link, n_1)`` where ``link`` is the URL string and ``n_1`` is
        the number of features as a positive ``int``. The default URL is used
        when the user declines or enters a blank URL; 20 features are used
        when the user declines or does not enter a positive integer.
    """
    default_link='https://edmonton.ctvnews.ca/new-covid-19-restrictions-alberta-bans-all-gatherings-closes-most-businesses-and-makes-masks-mandatory-1.5221814'
    default_features=20

    print('Do you have an URL link in mind?,type yes or no for the answer')
    # Normalise so that "Yes", "YES " etc. are not silently treated as "no".
    l=str(input()).strip().lower()
    link=''
    if l=='yes':
        print('What is it?')
        link=str(input()).strip()
    if not link:
        # Covers both a "no" answer and a blank URL.
        print('We will use the default URL link')
        link=default_link

    print('Do you have a desired number of features to use? If yes, type yes otherwise type no')
    ll=str(input()).strip().lower()
    n_1=None
    if ll=='yes':
        print('What is the number of features that you want?, must be an integer')
        try:
            candidate=int(input())
        except ValueError:
            candidate=0
        if candidate>0:
            n_1=candidate
        else:
            # Do not crash the notebook on a typo; fall through to the default.
            print('That is not a positive integer.')
    if n_1 is None:
        print('We will use 20 number of features as default')
        n_1=default_features
    return link,n_1

In [3]:
def final_classification(clas_1,n_fraction):
    """Summarise per-item predictions into one relevance verdict.

    Args:
        clas_1: sequence of predicted labels; items equal to 'Relevant' are
            counted, everything else is treated as not relevant.
        n_fraction: minimum fraction (0-1) of 'Relevant' items required for
            the whole text to be reported as relevant.

    Returns:
        str: a sentence stating the verdict and the percentage of relevant
        items (fraction rounded to two decimals, then expressed in percent).
        An empty ``clas_1`` is reported with a relevant fraction of 0.
    """
    total=len(clas_1)
    count=sum(1 for element in clas_1 if element=='Relevant')
    # Guard the division: an empty prediction set has no relevant items.
    fraction=float(count/total) if total else 0.0
    # Round again after scaling; round(x, 2)*100 on its own leaks binary
    # float noise into the message (e.g. 7.000000000000001).
    percent=round(round(fraction,2)*100,2)
    if fraction>=n_fraction:
        return('The analyzed text is relevant, with a relevant number of:'+str(percent)+"%")
    else:
        return('The analyzed text is not-relevant, with a relevant number of:'+str(percent)+"%")
    

In [4]:
def Rel_nonRel_oneszeros(n_observations,f_matrix,Features,n_repeated):
    """Label the first ``n_observations`` rows of a feature matrix.

    Args:
        n_observations: number of rows of ``f_matrix`` to label, from row 0.
        f_matrix: 2-D array indexable as ``f_matrix[i, :]``.
        Features: accepted for interface compatibility; not used.
        n_repeated: minimum number of entries equal to 1 a row needs in
            order to be labelled 'Relevant'.

    Returns:
        list[str]: one 'Relevant' / 'Non-relevant' label per row. Only
        entries exactly equal to 1 are counted; larger counts are ignored.
    """
    def _label_for(row_index):
        # Count the cells of this row whose value is exactly 1.
        ones=sum(1 for value in f_matrix[row_index,:] if value==1)
        return 'Relevant' if ones>=n_repeated else 'Non-relevant'

    return [_label_for(row_index) for row_index in range(n_observations)]

In [5]:
def freq_words(clean_tokens,new):
    """Drop unwanted tokens and English stopwords from a token list.

    Args:
        clean_tokens: iterable of tokens (hashable, e.g. strings) to filter.
        new: iterable of extra tokens to exclude. It is only read; the
            caller's object is not modified.

    Returns:
        list: the tokens of ``clean_tokens``, in their original order, that
        are neither in ``new`` nor in NLTK's English stopword list.
    """
    # Build the exclusion set locally instead of appending the stopwords to
    # ``new``: appending made the caller's list grow on every call, and a set
    # gives constant-time membership checks.
    excluded=set(new)
    excluded.update(stopwords.words('english'))
    return [tokens for tokens in clean_tokens if tokens not in excluded]

## Analysis of Text starts here

### Asking the user whether they have a URL in mind and a desired number of features

In [6]:
# Prompt interactively; User is the (link, number_of_features) tuple returned by ask_user().
User=ask_user()

Do you have an URL link in mind?,type yes or no for the answer
no
We will use the default URL link
Do you have a desired number of features to use? If yes, type yes otherwise type no
yes
What is the number of features that you want?, must be an integer
20


In [7]:
# Read the local text file and tokenize it; Text_file is used later to decide
# which scraped tokens to discard.
# The context manager closes the file handle as soon as the text is read.
with open("Alberta_New_Covid.txt",encoding="utf8") as File_text:
    R_file1=File_text.read()
Text_file=word_tokenize(R_file1)

### Downloading the website the user chose and storing the number of features requested for analyzing the text

In [8]:
# Download the page as text; the context manager closes the HTTP response
# once the body has been read.
# NOTE(review): User[0] may be typed in by the user and urlopen also accepts
# non-HTTP schemes (e.g. file://) - validate it if inputs can be untrusted.
with urlopen(User[0]) as response:
    html = response.read().decode("utf8")

In [9]:
# Second element of the ask_user() result: how many features the vectorizer should keep.
n_desired_features=User[1]

In [10]:
# Strip the markup and tokenize the visible text.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the extracted text could
# differ from one machine to another.
raw = BeautifulSoup(html, "html.parser").get_text()
raw_tokenize = nltk.word_tokenize(raw)
len(raw_tokenize)

14510

In [11]:
# Build the list of tokens to exclude: a hand-picked set of punctuation and
# other unwanted tokens, plus every scraped token that does not occur in the
# local text file (Text_file).
New=['The','As','(',')','.','I',',','We','ïƒ¼','â€¢',':','$','--','"','[',']']
# Use a set for the lookup; testing each scraped token against the Text_file
# list is quadratic and slow for pages with thousands of tokens.
reference_tokens=set(Text_file)
New.extend(element for element in raw_tokenize if element not in reference_tokens)
# De-duplicate; the resulting order is arbitrary.
New=list(set(New))

In [12]:
# Keep only the scraped tokens that are neither in the exclusion list New nor English stopwords.
clean_html=freq_words(raw_tokenize,New)
# Prints the full filtered token list (can be long).
print(clean_html)

['let', 'new', 'COVID-19', 'restrictions', 'Alberta', 'gatherings', 'businesses', 'masks', 'mandatory', 'COVID-19', 'restrictions', 'Alberta', 'gatherings', 'businesses', 'masks', 'mandatory', 'Alberta', 'banning', 'social', 'gatherings', 'closing', 'restaurants', 'gyms', 'hair', 'salons', 'making', 'masks', 'mandatory', 'across', 'province', 'curb', 'spread', 'COVID-19', 'restrictions', 'Alberta', 'gatherings', 'businesses', 'masks', 'mandatory', 'Alberta', 'banning', 'social', 'gatherings', 'closing', 'restaurants', 'gyms', 'hair', 'salons', 'making', 'masks', 'mandatory', 'across', 'province', 'curb', 'spread', 'COVID-19', 'EDMONTON', 'Edmonton', 'EDMONTON', 'COVID-19', 'restrictions', 'Alberta', 'gatherings', 'businesses', 'masks', 'mandatory', 'Tuesday', 'Tuesday', 'home', 'new', 'used', 'new', 'used', 'new', 'current', 'new', 'currently', 'currently', 'currently', 'new', 'used', 'new', 'used', 'used', 'work', 'current', 'Alberta', 'new', 'COVID-19', 'restrictions', 'Alberta', 'go

In [13]:
# Bag-of-words model limited to the n_desired_features most frequent terms.
vectorize=CountVectorizer(max_features=n_desired_features)
# Every element of clean_html is passed as its own document, so each row of X
# holds the feature counts for a single token.
X = vectorize.fit_transform(clean_html).toarray()

In [14]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and
# fall back to the old method on older releases.
_get_feature_names=getattr(vectorize,"get_feature_names_out",None) or vectorize.get_feature_names
Features=list(_get_feature_names())
Features

['19',
 'alberta',
 'businesses',
 'cases',
 'covid',
 'edmonton',
 'gatherings',
 'health',
 'hinshaw',
 'home',
 'jason',
 'kenney',
 'mandatory',
 'masks',
 'new',
 'premier',
 'province',
 'restaurants',
 'restrictions',
 'said']

In [15]:
# Label every row of X: 'Relevant' when at least one feature column equals 1, otherwise 'Non-relevant'.
Feature_labels=Rel_nonRel_oneszeros(len(X),X,Features,1)

### Splitting the data to train the Random Forest classifier and test it

In [16]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split( X, Feature_labels, test_size=0.25, random_state=42)

In [17]:
# Quick look at the training matrix (one row per token, one column per feature).
print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Using a Random Forest classifier class

In [18]:
# Random forest with 100 trees; the fixed random_state makes training repeatable.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit on the training rows and their 'Relevant' / 'Non-relevant' labels.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [19]:
# Predict labels for the held-out rows.
y_predict1=clf.predict(X_test)

### Testing the accuracy of the Random Forest classifier using a confusion matrix

In [20]:
# Rows are the true labels, columns the predicted labels.
# NOTE(review): the labels were derived directly from X by Rel_nonRel_oneszeros,
# so a perfect score here mostly shows the forest re-learned that rule.
confusion_matrix(y_test, y_predict1)

array([[135,   0],
       [  0,  63]], dtype=int64)

### Classifying a new text (from a file) as either "relevant" or "not relevant", based on our previously defined model. The new text was obtained from https://www.cbc.ca/news/world/russia-vaccine-caution-1.5833611.

In [21]:
# Read the new text from disk; the context manager closes the file handle as
# soon as the text is read.
with open("Russia_Sputnik.txt",encoding="utf8") as Russia_sput:
    Russia_text=Russia_sput.read()
Russia_raw=word_tokenize(Russia_text)
# Apply the same exclusion list (New) and stopword filter used for the scraped page.
Russia_filtered=freq_words(Russia_raw,New)
print(Russia_filtered)

['Russia', 'Sputnik', 'V', 'program', 'started', 'facing', 'resistance', 'Social', 'Sharing', 'Facebook', 'Twitter', 'Email', 'Reddit', 'LinkedIn', 'hesitancy', 'significant', 'concern', 'Russia', 'begins', 'mass', 'inoculation', 'Brown', '·', 'CBC', '·', 'Posted', 'Dec', '09', '4:00', 'AM', 'ET', 'hours', 'ago', 'nurse', 'Clinic', 'No', '68', 'Moscow', 'prepares', 'two', 'doses', 'Sputnik', 'V', 'volunteers', 'country', 'national', 'campaign', 'Tens', 'thousands', 'health', 'teachers', 'military', 'personnel', 'others', 'government', 'connections', 'taken', 'prior', 'launch', 'current', 'campaign', 'Corinne', 'Seminoff/CBC', '477', 'comments', 'Russia', 'mass', 'COVID-19', 'campaign', 'got', 'week', 'thousands', 'Russians', 'rolled', 'sleeves', 'volunteered', 'among', 'get', 'arms', 'jabbed', 'dose', 'Sputnik', 'V.', 'Many', 'others', 'however', 'appear', 'holding', 'see', 'things', 'turn', 'People', 'worried', 'understand', 'see', 'controversy', 'said', 'Dr.', 'Yevgeny', 'Timakov', '

In [22]:
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the columns match those the classifier was trained on.
X_russia=vectorize.transform(Russia_filtered).toarray()

In [23]:
# One predicted label per token of the new text.
Russia_predict=clf.predict(X_russia)

### Classifying the text based on the fraction of "relevant" vs. "not relevant" predictions made by the Random Forest classifier

In [24]:
# Threshold 0.5: the text is reported as relevant when at least half of the tokens are predicted 'Relevant'.
final_classification(Russia_predict,0.5)

'The analyzed text is not-relevant, with a relevant number of:2.0%'