## Importing necessary libraries

In [55]:
import nltk
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import sys, os
from warnings import simplefilter
if not sys.warnoptions:
    simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from pprint import pprint

In [50]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading the dataset

In [51]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [52]:
train_df.head()

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


In [53]:
test_df.head()

Unnamed: 0,TweetId,TweetText
0,306486520121012224,'28. The home side threaten again through Maso...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....
2,289531046037438464,'@Sochi2014 construction along the shores of t...
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...


## spliting the data into training and validation sets

In [54]:
X_train, X_val, y_train, y_val = train_test_split(train_df['TweetText'], train_df['Label'], test_size=0.2, random_state=42)

In [56]:
def preprocess_text_data(tweet_text):
    # Remove URLs, mentions, and special characters
    tweet_text = re.sub(r"http\S+|www\S+|https\S+", '', tweet_text, flags=re.MULTILINE)
    tweet_text = re.sub(r'\@\w+|\#', '', tweet_text)
    tweet_text = re.sub(r'[^\w\s]', '', tweet_text)

    # Tokenize the text
    tokens = word_tokenize(tweet_text)

    # Convert tokens to lowercase
    lowercase_tokens = [token.lower() for token in tokens]

    # Remove stopwords
    stopwords_list = stopwords.words('english')
    filtered_tokens = [token for token in lowercase_tokens if token not in stopwords_list]

    # Join the tokens back into a string
    preprocessed_tweet = ' '.join(filtered_tokens)
    
    return preprocessed_tweet


## preprocessing the tweet text

In [57]:
train_df['TweetText'] = train_df['TweetText'].apply(preprocess_text_data)
test_df['TweetText'] = test_df['TweetText'].apply(preprocess_text_data)

In [36]:
# create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# fit the vectorizer on the training data and transform the training and validation data
X_train = vectorizer.fit_transform(X_train).toarray()
X_val = vectorizer.transform(X_val).toarray()
X_test = vectorizer.transform(test_df['TweetText']).toarray()

In [37]:
# convert the target labels to numerical values
y_train = np.where(y_train == 'Politics', 1, 0)
y_val = np.where(y_val == 'Politics', 1, 0)

In [38]:
print(y_train)

[1 1 0 ... 1 1 1]


In [12]:
# train a linear SVM model
clf = LinearSVC(max_iter=5000)

In [13]:
# grid search for hyperparameter tuning
params = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(clf, params, cv=5)
grid_search.fit(X_train, y_train)



GridSearchCV(cv=5, estimator=LinearSVC(max_iter=5000),
             param_grid={'C': [0.1, 1, 10, 100]})

In [14]:
# get the best hyperparameters and evaluate on the validation set
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print('Validation accuracy:', acc)

Validation accuracy: 0.9486590038314177


In [16]:
# make predictions on the test data
test_pred = best_clf.predict(X_test)

In [17]:
print(test_pred)

[0 0 1 ... 0 1 0]


In [18]:
# convert the numerical labels back to text labels
test_pred = np.where(test_pred == 1, 'Politics', 'Sports')

In [None]:
# create submission file
submission_df = pd.DataFrame({'TweetId': test_df['TweetId'], 'Label': test_pred})
submission_df.to_csv('submission.csv', index=False)

In [None]:
submission_df

## Defining the model and Hyper-Parameter Tuning

In [58]:
X, y = train_df.TweetText, train_df.Label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, stratify=y
)

clf_pipe = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ('ensemble', VotingClassifier(
        voting='soft',
        estimators=[
            ('bnb', BernoulliNB()),
            ('lgreg', LogisticRegression()),
            ('rndf', RandomForestClassifier())
        ]
    ))
])

clf_params = {
    'vect__stop_words': ['english'],
    "vect__max_df": [0.5, 1.0],
    'vect__max_features': (5000, 10000),
    "vect__ngram_range": ((1, 1), (1, 2)),
    'tfidf__use_idf': [False, True],
    'tfidf__norm': ('l1', 'l2'),
    'ensemble__bnb__alpha': np.linspace(0.5, 10, 10),
    'ensemble__bnb__fit_prior': [True, False],
    'ensemble__lgreg__max_iter': (100, 1000),
    'ensemble__lgreg__solver': ['liblinear'],
    'ensemble__lgreg__penalty': ['l1', 'l2'],
    'ensemble__lgreg__C': np.linspace(10, 1000, 10),
    'ensemble__rndf__max_depth': [10, 20],
    'ensemble__rndf__min_samples_leaf': [2, 4],
    'ensemble__rndf__min_samples_split': [5, 10],
    'ensemble__rndf__n_estimators': [300, 600],
}

rsf = RepeatedStratifiedKFold()
clf = RandomizedSearchCV(clf_pipe, clf_params, scoring='roc_auc', verbose=1, cv=rsf)
clf.fit(X_train, y_train)

print("Best Score: ", clf.best_score_)
print("Best Params: ", end="")
pprint(clf.best_params_)

Fitting 50 folds for each of 10 candidates, totalling 500 fits
Best Score:  0.9859654752114662
Best Params: {'ensemble__bnb__alpha': 0.5,
 'ensemble__bnb__fit_prior': True,
 'ensemble__lgreg__C': 10.0,
 'ensemble__lgreg__max_iter': 100,
 'ensemble__lgreg__penalty': 'l1',
 'ensemble__lgreg__solver': 'liblinear',
 'ensemble__rndf__max_depth': 10,
 'ensemble__rndf__min_samples_leaf': 4,
 'ensemble__rndf__min_samples_split': 10,
 'ensemble__rndf__n_estimators': 600,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vect__max_df': 1.0,
 'vect__max_features': 10000,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': 'english'}


In [65]:
test_df["Label"] = clf.predict(test_df['TweetText'])


In [68]:
print(test_df["Label"])

0         Sports
1         Sports
2       Politics
3       Politics
4         Sports
          ...   
2605      Sports
2606      Sports
2607      Sports
2608    Politics
2609      Sports
Name: Label, Length: 2610, dtype: object


## Creating the Submission File

In [71]:
results = test_df[["TweetId", "Label"]].reset_index(drop=True)
results.to_csv("submission.csv", index=False)