# Import the required libraries

In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_auc_score 

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer

from warnings import filterwarnings
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline

# Import datasets

In [2]:
train_dataset = pd.read_csv("/content/train.csv")

train_dataset

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...
...,...,...,...
6520,296675082267410433,Politics,'Photo: PM has laid a wreath at Martyrs Monume...
6521,306677536195231746,Sports,'The secret of the Chennai pitch - crumbling o...
6522,306451295307431937,Sports,@alinabhutto he isn't on Twitter either
6523,306088574221176832,Sports,'Which England player would you take out to di...


In [3]:
train_dataset.describe()

Unnamed: 0,TweetId
count,6525.0
mean,2.887131e+17
std,5.139819e+16
min,23909310000.0
25%,2.94138e+17
50%,3.025319e+17
75%,3.053242e+17
max,3.068341e+17


In [4]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6525 entries, 0 to 6524
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   TweetId    6525 non-null   int64 
 1   Label      6525 non-null   object
 2   TweetText  6525 non-null   object
dtypes: int64(1), object(2)
memory usage: 153.1+ KB


In [5]:
test_dataset = pd.read_csv("/content/test.csv")
test_dataset

Unnamed: 0,TweetId,TweetText
0,306486520121012224,'28. The home side threaten again through Maso...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....
2,289531046037438464,'@Sochi2014 construction along the shores of t...
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...
...,...,...
2605,282023761044189184,'Qualifier 1 and Eliminator games will be play...
2606,303879735006601216,"@reesedward Hi Edward, it's not a #peacekeepin..."
2607,297956846046703616,'Perera was @SunRisersIPL first #IPL purchase ...
2608,304265049537658880,"'#SecKerry: Thanks to Senator @TimKaine, @RepR..."


# Clean dataset

In [6]:
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
import nltk
nltk.download('omw-1.4')

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = nltk.corpus.stopwords.words(['english'])

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
def cleaned(data):
    #remove urls
    tweet= re.sub(r'http\S+',' ', data) 

    #remove hashtags
    tweet= re.sub(r'#\w+', ' ', tweet)  

    #3. Remove mentions and characters that not in the English alphabets
    tweet = re.sub(r'@\w+',' ', tweet)
    precleaned_tweet = re.sub('[^A-Za-z]+', ' ', tweet)

    #2. Tokenize
    tweet = TweetTokenizer().tokenize(precleaned_tweet) 

    #3. Remove Puncs
    tokens = [w for w in tweet if w.isalpha()]  

     #4. Removing Stopwords
    tokens = [t for t in tokens if t not in stop_words] 

    #5. lemma
    text_cleaned = [lem.lemmatize(t) for t in tokens]  
    
    #6. Joining
    return " ".join(text_cleaned)  

train_dataset['TweetTextCleaned'] = train_dataset['TweetText'].apply(cleaned)

In [8]:
train_dataset

Unnamed: 0,TweetId,Label,TweetText,TweetTextCleaned
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...,The value measured dollar term deepest America...
1,304834304222064640,Politics,'@rraina1481 I fear so',I fear
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...,Watch video highlight final Australia West Indies
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...,RT At Nitro Circus
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...,Always good thing Thanks feedback
...,...,...,...,...
6520,296675082267410433,Politics,'Photo: PM has laid a wreath at Martyrs Monume...,Photo PM laid wreath Martyrs Monument Algiers
6521,306677536195231746,Sports,'The secret of the Chennai pitch - crumbling o...,The secret Chennai pitch crumbling edge solid ...
6522,306451295307431937,Sports,@alinabhutto he isn't on Twitter either,Twitter either
6523,306088574221176832,Sports,'Which England player would you take out to di...,Which England player would take dinner Featuri...


In [9]:
result = train_dataset.groupby('Label')['TweetId'].nunique()
result

Label
Politics    3200
Sports      3325
Name: TweetId, dtype: int64

# Encoding feature label

In [10]:
label_enc = {"Label":     {"Sports": 0, "Politics": 1}}
train_data = train_dataset.replace(label_enc)


In [11]:
train_data

Unnamed: 0,TweetId,Label,TweetText,TweetTextCleaned
0,304271250237304833,1,'#SecKerry: The value of the @StateDept and @U...,The value measured dollar term deepest America...
1,304834304222064640,1,'@rraina1481 I fear so',I fear
2,303568995880144898,0,'Watch video highlights of the #wwc13 final be...,Watch video highlight final Australia West Indies
3,304366580664528896,0,'RT @chelscanlan: At Nitro Circus at #AlbertPa...,RT At Nitro Circus
4,296770931098009601,0,'@cricketfox Always a good thing. Thanks for t...,Always good thing Thanks feedback
...,...,...,...,...
6520,296675082267410433,1,'Photo: PM has laid a wreath at Martyrs Monume...,Photo PM laid wreath Martyrs Monument Algiers
6521,306677536195231746,0,'The secret of the Chennai pitch - crumbling o...,The secret Chennai pitch crumbling edge solid ...
6522,306451295307431937,0,@alinabhutto he isn't on Twitter either,Twitter either
6523,306088574221176832,0,'Which England player would you take out to di...,Which England player would take dinner Featuri...


# Sample stratification

In [12]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

X = train_data['TweetTextCleaned'].to_numpy()
y = train_data['Label'].to_numpy()
for train_data_index, test_data_index in sss.split(X, y):
    
    X_train, X_test = X[train_data_index], X[test_data_index]

    y_train, y_test = y[train_data_index], y[test_data_index]

In [13]:
#function pipeline
tweets_pipe = Pipeline([('CVec', CountVectorizer(stop_words='english')),
                     ('Tfidf', TfidfTransformer())])

X_train_tranformed = tweets_pipe.fit_transform(X_train)

X_test_tranformed = tweets_pipe.transform(X_test)

# Model RidgeClassifier

In [14]:
#Classifiers : Supervised Machine Learning Algorithms
classifiers = {
    'RidgeClassifier': RidgeClassifier(class_weight='balanced')
}

# Model Evaluation

In [15]:
no_classifiers = len(classifiers.keys())

from time import process_time 

#Results
def batch_classify(X_train_tranformed, y_train, X_test_tranformed, y_test, verbose = True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers,3)), columns = ['Classifier', 'Area Under Curve', 'Training time'])
    count = 0
    for key, classifier in classifiers.items():
        t_start = process_time()  
        classifier.fit(X_train_tranformed, y_train)
        t_stop = process_time() 
        t_elapsed = t_stop - t_start
        y_predicted = classifier.predict(X_test_tranformed)
        
        df_results.loc[count,'Classifier'] = key
        df_results.loc[count,'Area Under Curve'] = roc_auc_score(y_test, y_predicted)
        df_results.loc[count,'Training time'] = t_elapsed
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_elapsed))
        count+=1

    return df_results


In [16]:

df_results = batch_classify(X_train_tranformed, y_train,X_test_tranformed, y_test)
print(df_results.sort_values(by='Area Under Curve', ascending=False))

trained RidgeClassifier in 0.02 s
        Classifier  Area Under Curve  Training time
0  RidgeClassifier          0.913939       0.021375


In [17]:

test_dataset['cleaned'] = test_dataset['TweetText'].apply(lambda x : cleaned(x))
cleanup_nums = {"Label":     {"Politics": 0, "Sports": 1}}
test = test_dataset.replace(cleanup_nums)

In [18]:
x = test_dataset['cleaned'].to_numpy()


In [19]:
x_test_tranformed = tweets_pipe.transform(x)

# Predictions

In [20]:
Classifier = RidgeClassifier()
Classifier.fit(X_train_tranformed, y_train)
y_predicted = Classifier.predict(x_test_tranformed)
y_predicted




array([0, 0, 1, ..., 0, 0, 0])

In [21]:
test_result = pd.Series(y_predicted, name = "Label").astype(int)
test_result

0       0
1       0
2       1
3       1
4       0
       ..
2605    0
2606    0
2607    0
2608    0
2609    0
Name: Label, Length: 2610, dtype: int64

# Import results to a csv file

In [22]:
results = pd.concat([test_dataset["TweetId"], test_result],axis = 1)
cleanup_nums = {"Label":     {0: "Politics", 1: "Sports"}}
results = results.replace(cleanup_nums)
results.to_csv("tweet_sub.csv", index = False)

In [23]:
tweet_sub_df = pd.read_csv("tweet_sub.csv")

In [24]:
tweet_sub_df.sample(10)

Unnamed: 0,TweetId,Label
1292,306745571924905985,Politics
1092,234767580588564480,Politics
291,302051129221607424,Sports
1093,305957211878875136,Politics
379,292402320862482432,Sports
1511,234286233717723136,Politics
1119,278310707710279680,Sports
1808,294978232611127296,Sports
1176,305679563596374018,Politics
22,304684623068282880,Sports


In [25]:
resultt = tweet_sub_df.groupby('Label')['TweetId'].nunique()
resultt

Label
Politics    1495
Sports      1115
Name: TweetId, dtype: int64

# End