In [6]:
import pandas as pd

data = pd.read_csv(r"C:\Users\A N A N T H\Downloads\articles.csv", encoding='latin1')

In [7]:
data.head()

Unnamed: 0,Id,Heading,Article.Banner.Image,Outlets,Article.Description,Full_Article,Article_Type,Tonality
0,d6995462-5e87-453b-b64d-e9f1df6e94d2,"A Puzzling Maneuver, Then Freefall: NTSB Repor...",,Essex Caller,<p>The helicopter that crashed in Southeast Al...,<p>The helicopter that crashed in Southeast Al...,Commercial,Negative
1,8b05e939-a89e-4548-b92b-013822e8ee7d,Bells Nexus Air Taxi Concept Rings Changes Fo...,,Aviation Week Network,<p>A year after teasing the fledgling electric...,<p>A year after teasing the fledgling electric...,Commercial,Positive
2,69fcd400-bceb-4255-8277-619f2d68ac0b,Bell Helicopter Show Air Taxi Nexus,http://images.tmtpost.com/uploads/images/2019/...,TMTPost,<p>Bell released the full-size design of the v...,<p>Bell released the full-size design of the v...,Commercial,Positive
3,17943578-c11b-414b-b3f5-063d3a93157b,BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...,http://www.fredzone.org/wp-content/uploads/201...,Fredzone,<p>Bell est une soci&eacute;t&eacute; am&eacut...,<p>Bell est une soci&eacute;t&eacute; am&eacut...,Commercial,Positive
4,f33c7b11-5f77-4a98-bb2e-d36689042aea,Les premiers retours dOlivier Ezratty,,FrenchWeb,<p>It was still anecdotal to observe the explo...,<p>It was still anecdotal to observe the explo...,Commercial,Positive


In [8]:
data['Heading']

0       A Puzzling Maneuver, Then Freefall: NTSB Repor...
1       Bells Nexus Air Taxi Concept Rings Changes Fo...
2                     Bell Helicopter Show Air Taxi Nexus
3       BELL DÉVOILE LA CONCEPTION INTÉGRALE DE SON TA...
4                  Les premiers retours dOlivier Ezratty
                              ...                        
4300    Chinook Catches Army Flirting With Younger, Th...
4301    Lufthansa Aviation Selects Reiser Simulation H...
4302     This Bell Flight Drone Wont Be Delivering Pizza
4303    Blade Offers New York Airport Transfers for $1...
4304                  US Little Birds Flying to Lebanon
Name: Heading, Length: 4305, dtype: object

In [14]:
import re
def remove_non_alphabets(text):
    text = re.sub('<p>|</p>', '', text)
    return re.sub('[^a-zA-Z]', '', text)

data['Heading_clean'] = data['Heading'].apply(remove_non_alphabets)
data['Article_Description_clean'] = data['Article.Description'].apply(remove_non_alphabets)
data['Full_Article_clean'] = data['Full_Article'].apply(remove_non_alphabets)

In [19]:
clean_data = data.dropna(subset=['Heading_clean', 'Article_Description_clean', 'Full_Article_clean', 'Article_Type'])

In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

clean_data['Combined_text'] = (clean_data['Heading_clean'] + " " + 
                                 clean_data['Article_Description_clean'] + " " + 
                                 clean_data['Full_Article_clean'])

text_embeddings = model.encode(clean_data['Combined_text'].tolist(), show_progress_bar=True)


Batches:   0%|          | 0/135 [00:00<?, ?it/s]

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(text_embeddings, clean_data['Article_Type'], test_size=0.2, random_state=42)

clf = RandomForestClassifier()


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
}

# Initialize GridSearchCV with the classifier
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")


Best Parameters: {'max_depth': 20, 'n_estimators': 200}


In [24]:
clf = RandomForestClassifier(max_depth= best_params['max_depth'], n_estimators=best_params['n_estimators'])
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7804878048780488


In [27]:
from sklearn.model_selection import cross_val_score, KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
print(f"Accuracy: {scores}")


Accuracy: [0.76777939 0.78374456 0.75036284 0.7532656  0.77034884]


In [31]:
from sklearn.metrics import  precision_score, recall_score, f1_score

accuracy_scores = accuracy_score(y_test, y_pred)
precision_scores = precision_score(y_test, y_pred, average='weighted')
recall_scores = recall_score(y_test, y_pred, average='weighted')
f1_scores = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy_scores}")
print(f"Precision: {precision_scores}")
print(f"Recall: {recall_scores}")
print(f"F1-Score: {f1_scores}")

Accuracy: 0.7804878048780488
Precision: 0.7986400535155113
Recall: 0.7804878048780488
F1-Score: 0.7631739064740408


In [32]:
import joblib

# Save the model to disk
joblib.dump(clf, 'text_classification_model.pkl')

# Reload the model from disk
clf_loaded = joblib.load('text_classification_model.pkl')


In [33]:
test_data = pd.read_csv(r"C:\Users\A N A N T H\Downloads\unknown_articles.csv", encoding='latin1')

In [63]:
import requests
from bs4 import BeautifulSoup

def extract_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    heading = soup.find('h1').text
    full_article = soup.find('p1').text
    
    return heading, full_article


In [64]:
headings = []
f_a = []
for url in test_data['Article.URL'][:10]:
    h, a = extract_article(url)
    headings.append(h)
    f_a.append(a)

AttributeError: 'NoneType' object has no attribute 'text'

In [65]:
headings

[]