In [1]:
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Download the NLTK resources this notebook relies on (stop-word list,
# tokenizer models and WordNet data). nltk.download reports when a package
# is already up to date.
for _resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Kept as a set: remove_stop_words performs one membership test per token,
# and a list would turn every one of those tests into a linear scan.
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmetizer = nltk.WordNetLemmatizer()

def remove_stop_words(text):
    """Return the tokens of ``text`` that are not stop words.

    Each token is lower-cased before it is looked up in the module-level
    ``stopwords`` collection; tokens that survive keep their original casing
    and order.
    """
    kept = []
    for token in text:
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return kept

def lemmetize_words(word_list):
    """Return a new list holding ``lemmetizer.lemmatize(word)`` for each word.

    Uses the module-level ``lemmetizer``; the input order is preserved.
    """
    return list(map(lemmetizer.lemmatize, word_list))

def remove_special_characters(text):
    """Return only the tokens of ``text`` that are purely alphabetic.

    A token is kept when ``token.isalpha()`` is true, so tokens containing
    digits or punctuation, as well as empty strings, are dropped.
    """
    alphabetic = []
    for token in text:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

def remove_non_english_words(text):
    """Return the words whose characters all belong to ``string.printable``.

    Any word containing at least one character outside that set (for
    example an accented letter) is dropped; an empty word is kept.
    """
    allowed = frozenset(string.printable)
    kept = []
    for word in text:
        # issuperset iterates the word character by character
        if allowed.issuperset(word):
            kept.append(word)
    return kept

def fix_text(row):
    """Join the strings in ``row`` into one string separated by single spaces."""
    joined = str.join(' ', row)
    return joined

# Load the training set, discard the 'author' column, then drop every row
# that still contains a missing value in any remaining column.
df = pd.read_csv('train.csv').drop('author', axis=1).dropna()

# Build the cleaned 'final_text' column. Stages run in this order:
# tokenise -> keep alphabetic tokens -> keep printable-character tokens ->
# drop stop words -> lemmatise -> join back into one space-separated string.
df['final_text'] = (
    df['text']
    .apply(nltk.word_tokenize)
    .apply(remove_special_characters)
    .apply(remove_non_english_words)
    .apply(remove_stop_words)
    .apply(lemmetize_words)
    .apply(fix_text)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Casper\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Casper\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Casper\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Casper\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Top-20 single-word (unigram) frequencies over the cleaned corpus.
vec_single = CountVectorizer()
# fit_transform learns the vocabulary and builds the document-term matrix in
# a single pass instead of separate fit and transform passes over the corpus.
bag_of_words = vec_single.fit_transform(df['final_text'])
# Column sums: a 1 x n_terms matrix holding each term's corpus-wide count.
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vec_single.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

# Split the 20 most frequent (word, count) pairs into parallel lists for plotting.
x_single = [word for word, _ in words_freq[:20]]
count_single = [freq for _, freq in words_freq[:20]]

plt.barh(x_single, count_single)
# Title matches the plural wording used by the bigram and trigram charts.
plt.title("Top 20 words used in news articles")
plt.xlabel('Count')
plt.show()

In [None]:
# Top-20 bigram frequencies over the cleaned corpus.
vec_double = CountVectorizer(ngram_range=(2, 2))
bag_of_words = vec_double.fit(df['final_text']).transform(df['final_text'])
sum_words = bag_of_words.sum(axis=0)

# Pair every bigram with its corpus-wide count, then order most frequent first.
words_freq = [
    (term, sum_words[0, column])
    for term, column in vec_double.vocabulary_.items()
]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

# Parallel lists of labels and counts for the 20 most frequent bigrams.
x_double = [term for term, _ in words_freq[:20]]
count_double = [count for _, count in words_freq[:20]]

plt.barh(x_double, count_double)
plt.title('Top 20 bigrams used in news articles')
plt.xlabel('Count')
plt.show()

In [None]:
# Top-20 trigram frequencies over the cleaned corpus.
vec_triple = CountVectorizer(ngram_range=(3, 3))
bag_of_words = vec_triple.fit(df['final_text']).transform(df['final_text'])
sum_words = bag_of_words.sum(axis=0)

# Pair every trigram with its corpus-wide count, then order most frequent first.
words_freq = [
    (term, sum_words[0, column])
    for term, column in vec_triple.vocabulary_.items()
]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

# Parallel lists of labels and counts for the 20 most frequent trigrams.
x_triple = [term for term, _ in words_freq[:20]]
count_triple = [count for _, count in words_freq[:20]]

plt.barh(x_triple, count_triple)
plt.title('Top 20 trigrams used in news articles')
plt.xlabel('Count')
plt.show()

In [2]:
# Fixed seed so the split, and every score computed from it, is reproducible
# across kernel restarts.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df['final_text'], df['label'], test_size=0.2, random_state=RANDOM_STATE
)

# Learn the vocabulary and IDF weights from the training rows only. Fitting on
# the full corpus would let statistics from the held-out rows leak into the
# features and inflate the reported test accuracy.
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(train_x)
Train_X_Tfidf = Tfidf_vect.transform(train_x)
Test_X_Tfidf = Tfidf_vect.transform(test_x)

In [3]:
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the SVC regularisation strength C.
parameters = dict(C=[1, 10, 50, 100])
SVM_model_2 = svm.SVC()
svm_grid = GridSearchCV(estimator=SVM_model_2, param_grid=parameters)
svm_grid.fit(Train_X_Tfidf, train_y)

In [None]:
# Report the winning parameter combination and its cross-validated score.
svm_best_params = svm_grid.best_params_
svm_best_score = svm_grid.best_score_
print(f'The best parameters for this model are: {svm_best_params} and score is {svm_best_score}')

In [None]:
# Train an SVC with scikit-learn's default hyper-parameters on the TF-IDF
# training features and score it on the held-out split. Note that the
# grid-search result held in svm_grid is not used for this model.
SVM_model = svm.SVC()
SVM_model.fit(Train_X_Tfidf, train_y)
predictions_SVM = SVM_model.predict(Test_X_Tfidf)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("SVM Accuracy Score -> ", accuracy_score(test_y, predictions_SVM)*100)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

# confusion_matrix expects (y_true, y_pred). With that order the rows are the
# true labels and the columns the predicted labels; passing the predictions
# first yields the transposed matrix.
cm_svm = confusion_matrix(test_y, predictions_SVM)

# Show each cell as a share of all test samples, with labelled axes so the
# orientation of the matrix is unambiguous.
plt.title('SVM Heatmap')
sns.heatmap(cm_svm/np.sum(cm_svm), annot=True, fmt='0.2%')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Cross-validated search over k = 1..10 for a k-nearest-neighbours classifier,
# followed by predictions on the held-out split (GridSearchCV.predict uses the
# refitted best estimator).
parameters = {'n_neighbors': list(range(1, 11))}
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(estimator=knn, param_grid=parameters, scoring='accuracy')
knn_grid.fit(Train_X_Tfidf, train_y)
knn_grid_pred = knn_grid.predict(Test_X_Tfidf)

In [None]:
print(f'The best parameters for this model are: {knn_grid.best_params_} and score is {knn_grid.best_score_}')

# cv_results_['params'] lists the candidate settings in the same order as
# cv_results_['mean_test_score'], so read the neighbour counts from it. Plotting
# the scores alone would put them at positions 0..9 rather than at the actual
# n_neighbors values 1..10.
neighbour_counts = [candidate['n_neighbors'] for candidate in knn_grid.cv_results_['params']]

plt.plot(neighbour_counts, knn_grid.cv_results_['mean_test_score'])
plt.xticks(neighbour_counts)
plt.title('Scores of Different Amount of Neighbours')
plt.ylabel('Accuracy Score')
plt.xlabel('Amount of Neighbours')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

# confusion_matrix expects (y_true, y_pred). With that order the rows are the
# true labels and the columns the predicted labels; passing the predictions
# first yields the transposed matrix.
cm_knn = confusion_matrix(test_y, knn_grid_pred)

# Show each cell as a share of all test samples, with labelled axes so the
# orientation of the matrix is unambiguous.
plt.title('KNN Heatmap')
sns.heatmap(cm_knn/np.sum(cm_knn), annot=True, fmt='0.2%')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch and parse the BBC News front page.
# requests applies no timeout unless one is given, so an unresponsive server
# would otherwise block this cell indefinitely.
page = requests.get("https://www.bbc.com/news", timeout=10)

soup = BeautifulSoup(page.content, "html.parser")

# NOTE(review): this rebinds `df`, replacing the training DataFrame loaded at
# the top of the notebook. Re-running the earlier vectoriser or split cells
# after this one will fail because 'final_text' no longer exists. Consider a
# distinct name; the binding is kept here in case other cells rely on it.
df = pd.DataFrame(columns=['Text'])
def find_text(link, paragraph_class="ssrcss-1q0x1qg-Paragraph eq5iqo00", timeout=10):
    """Download an article page and return its paragraph text as one string.

    Parameters
    ----------
    link : str
        URL of the page to download.
    paragraph_class : str, optional
        Value of the ``class`` attribute identifying the elements to extract.
        The default is the class string this notebook was written against;
        it is site-generated markup, so pass a different value if the page
        layout changes.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        it a stalled connection would block indefinitely.

    Returns
    -------
    str
        The ``.text`` of every matching element joined with single spaces;
        an empty string when nothing matches.
    """
    response = requests.get(link, timeout=timeout)
    parsed = BeautifulSoup(response.content, "html.parser")
    paragraphs = parsed.find_all(class_=paragraph_class)
    return ' '.join(paragraph.text for paragraph in paragraphs)

In [None]:
def article_pred(link):
    """Scrape the article at ``link`` and classify it with the trained SVM.

    The text goes through the same cleaning stages, in the same order, as the
    training data's 'final_text' column, is vectorised with the fitted
    ``Tfidf_vect`` and is then passed to ``SVM_model.predict``.

    Parameters
    ----------
    link : str
        URL of the article to classify.

    Returns
    -------
    The value returned by ``SVM_model.predict`` for the single article.
    """
    text = find_text(link)
    text = nltk.word_tokenize(text)
    text = remove_special_characters(text)
    # The training pipeline applies this filter too; keep inference consistent.
    text = remove_non_english_words(text)
    text = remove_stop_words(text)
    text = lemmetize_words(text)
    text = fix_text(text)
    # transform expects an iterable of documents, hence the one-element list.
    features = Tfidf_vect.transform([text])
    return SVM_model.predict(features)

# Example: classify one BBC article; the prediction is the cell's output.
article_pred(link='https://www.bbc.com/news/world-us-canada-63463738')