# Tesla News Classifier

## import necessary libraries


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import joblib
import os
from transformers import pipeline

# Load Datasets and Label Data

In [24]:

df1= pd.read_csv('guardian_headlines.csv')  # General news
df2 = pd.read_csv('tesla_news_cleaned.csv')      # Tesla related data
#all cleaned tesla news is relevant the other news should contain keywords
keywords = ['tesla','elon musk','cybertruck','electric vehicle']
df1['label'] = df1['title'].str.lower().str.contains('|'.join(keywords)).astype(int)
df2['label'] = 1
#combine the datasets and clean
df = pd.concat([df1, df2], ignore_index=True)
df = df[['date', 'title', 'label']].dropna(subset=['title'])
df = df[df['title'].str.strip() != '']
df.info()
df.head()



<class 'pandas.core.frame.DataFrame'>
Index: 183034 entries, 0 to 183071
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    183034 non-null  object
 1   title   183034 non-null  object
 2   label   183034 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 5.6+ MB


Unnamed: 0,date,title,label
0,18-Jul-20,Johnson is asking Santa for a Christmas recovery,0
1,18-Jul-20,‘I now fear the worst’: four grim tales of wor...,0
2,18-Jul-20,Five key areas Sunak must tackle to serve up e...,0
3,18-Jul-20,Covid-19 leaves firms ‘fatally ill-prepared’ f...,0
4,18-Jul-20,The Week in Patriarchy \n\n\n Bacardi's 'lad...,0


## Cleaning the Data

In [25]:
def clean_text(text):
    text = str(text)
    text = re.sub(r'http\S+', '', text)          # Remove URLs
    text = re.sub('[^a-zA-Z]', ' ', text)        # Keep only letters
    text = re.sub('\s+', ' ', text).lower()      # Remove extra spaces and lowercase evrything
    return text.strip()

df.dropna(subset = ['date'])# remove all rows without dates
df['clean_text'] = df['title'].astype(str)
df['clean_text'] = df['clean_text'].apply(clean_text)
cols_to_keep = ['date','clean_text','label'] #only keep columns to be used for training
df = df.drop(columns = [col for col in df.columns if col not in cols_to_keep])
df.info()

  text = re.sub('\s+', ' ', text).lower()      # Remove extra spaces and lowercase evrything


<class 'pandas.core.frame.DataFrame'>
Index: 183034 entries, 0 to 183071
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        183034 non-null  object
 1   label       183034 non-null  int64 
 2   clean_text  183034 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.6+ MB


## Labelling The Data


In [26]:
#keywords = ['tesla','elon musk','cybertruck']
#df['label'] = df['clean_text'].apply(lambda x: 1 if any(keyword in x for keyword in keywords) else 0)
#df.head()

## Balancing the dataset

In [27]:
from sklearn.utils import resample

print("\nCounts before balancing: ")
print(df['label'].value_counts())
# reduce sample size for tesla specific data

df_tesla = df[df['label'] ==1]
df_unrelated =df[df['label']==0]
#balance the tesla data with the unrelated data
balanced_tesla = resample(df_tesla, n_samples=len(df_unrelated), random_state=42)
df_balanced = pd.concat([balanced_tesla, df_unrelated])
print("\nCounts after balancing: ")
print(df_balanced['label'].value_counts())





Counts before balancing: 
label
1    165339
0     17695
Name: count, dtype: int64

Counts after balancing: 
label
1    17695
0    17695
Name: count, dtype: int64


# Training The Model

In [28]:
from sklearn.metrics import classification_report
#split into train and test sets
X = df_balanced['clean_text']  # Text data
y = df_balanced['label']       # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,random_state=42
)

#vectorize and train
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

#Train the model
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Evaluate the model
print("Training Accuracy:", model.score(X_train_vec, y_train))
y_pred = model.predict(X_test_vec)
print("Testing Accuracy:", model.score(X_test_vec, y_test))
print(classification_report(y_test, y_pred))

#save the model

joblib.dump(model, 'tesla_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')


Training Accuracy: 0.9585334840350381
Testing Accuracy: 0.9580389940661204
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      3539
           1       0.98      0.93      0.96      3539

    accuracy                           0.96      7078
   macro avg       0.96      0.96      0.96      7078
weighted avg       0.96      0.96      0.96      7078



['tfidf_vectorizer.joblib']

# Prediction Function

In [29]:
#creating a prediction function
def predict_relevance(text):
  cleaned = clean_text(text)
  vector = vectorizer.transform([cleaned])
  prediction = model.predict(vector)
  return 'Relevant' if prediction[0] == 1 else 'Irrelevant'

In [30]:
#test out prediction function
new_article = "Tesla delivers 499 out of 550 vehicles in 2020 just shy of the target"
print(predict_relevance(new_article))
print(predict_relevance("Zimbabwe's inflation rate has increased to 12.7%"))


Relevant
Irrelevant
