In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deeptweets/sample_submission.csv
/kaggle/input/deeptweets/training.csv
/kaggle/input/deeptweets/train.csv
/kaggle/input/deeptweets/test.csv


# **Imports used throughout this task**

In [6]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import scipy as sp
import numpy as np
from sklearn.metrics import accuracy_score
import string
import nltk
nltk.download("omw-1.4")

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# TweetText Preprocessing

In [7]:
def preprocess(text):
    
    punctuation = string.punctuation
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # remove urls
    text = re.sub(r'https?://\S+', '', text, flags=re.MULTILINE)
    
    # remove punctuations and usernames and lemmatization
    table = str.maketrans('', '', punctuation)
    words = [w.translate(table).lower() for w in text.split(" ")]    
    
    # remove usernames and tags and stopwords
    words = [w for w in words if len(w)>=1 and w[0] is not "@" and w[0] is not "#" and w not in stop_words]
    words = set(words)
    
    # lemmatization
    words = [lemmatizer.lemmatize(w) for w in words if len(w)>1]
    
    return words
    

In [8]:

train =  pd.read_csv("/kaggle/input/deeptweets/train.csv")
test =  pd.read_csv("/kaggle/input/deeptweets/test.csv")

In [9]:
train["TweetText"] = train["TweetText"].apply(lambda x: preprocess(x))
print(train)

                 TweetId     Label  \
0     304271250237304833  Politics   
1     304834304222064640  Politics   
2     303568995880144898    Sports   
3     304366580664528896    Sports   
4     296770931098009601    Sports   
...                  ...       ...   
6520  296675082267410433  Politics   
6521  306677536195231746    Sports   
6522  306451295307431937    Sports   
6523  306088574221176832    Sports   
6524  277090953242759169  Politics   

                                              TweetText  
0     [deepest, value, value, dollar, term, usaid, m...  
1                                    [rraina1481, fear]  
2     [wwc13, highlight, australia, west, video, ind...  
3     [circus, ausgrandprix, nitro, cantwaitforausgp...  
4     [feedback, good, thanks, thing, cricketfox, al...  
...                                                 ...  
6520  [algeria, photo, algiers, laid, pminafrica, wr...  
6521  [like, middle, pitch, solid, secret, edge, bar...  
6522               [i

In [10]:
# Load the training data
test['TweetText'] = test['TweetText'].apply(lambda x: preprocess(x))
print(test['TweetText'])

0       [mason, threaten, throw, yard, side, home, sta...
1       [asking, series, pound, see, mass, unit, avoir...
2       [sea, black, along, sochi2014, construction, s...
3       [foreign, minister, seckerryu2019s, video, mee...
4       [ponting, price, iplauction, 400000, go, playe...
                              ...                        
2605    [played, may, game, chidambaram, home, runners...
2606    [reesedward, list, edward, 16, guineabissau, h...
2607    [sunrisersipl, iplauction, perera, first, purc...
2608    [reproberthurt, statedept4us, thanks, senator,...
2609    [official, pinterest, account, shankly, lfc, p...
Name: TweetText, Length: 2610, dtype: object


# Features Extraction

In [11]:
train_words_count={}

for i in range(len(train)):
    for j in train['TweetText'][i]:
        if j in train_words_count.keys():
            train_words_count[j]+=1
        else:
            train_words_count[j]=1
train_words_count

{'deepest': 2,
 'value': 9,
 'dollar': 9,
 'term': 19,
 'usaid': 5,
 'measured': 2,
 'statedept': 10,
 'seckerry': 51,
 'american': 65,
 'rraina1481': 4,
 'fear': 8,
 'wwc13': 30,
 'highlight': 31,
 'australia': 105,
 'west': 47,
 'video': 181,
 'indie': 16,
 'watch': 149,
 'final': 159,
 'circus': 2,
 'ausgrandprix': 45,
 'nitro': 2,
 'cantwaitforausgp': 29,
 'chelscanlan': 12,
 'albertpark': 8,
 'chadwickmodels': 1,
 'rt': 873,
 'theymakeitlooksoeasy': 1,
 'feedback': 28,
 'good': 103,
 'thanks': 140,
 'thing': 41,
 'cricketfox': 1,
 'always': 26,
 'fiscal': 19,
 'dr': 27,
 'policy': 52,
 'rajan': 11,
 'space': 11,
 'create': 28,
 'monetary': 4,
 'consolidation': 4,
 'act': 35,
 'spending': 19,
 'leave': 12,
 'congress': 54,
 'stop': 25,
 '800000': 2,
 'employee': 6,
 'jobsnotcuts': 11,
 'take': 151,
 'doesnt': 19,
 'impending': 2,
 'cut': 57,
 'fact': 35,
 'defense': 12,
 'forced': 3,
 'unpaid': 1,
 '1st': 237,
 'wkt': 10,
 '39': 10,
 '1014': 2,
 'clarke': 55,
 '24': 34,
 'test': 33

In [12]:
test_words_count={}
for i in range(len(test)):
    for j in test['TweetText'][i]:
        if j in test_words_count.keys():
            test_words_count[j]+=1
        else:
            test_words_count[j]=1
test_words_count

{'mason': 3,
 'threaten': 2,
 'throw': 2,
 'yard': 3,
 'side': 14,
 'home': 17,
 'stab': 1,
 'wide': 5,
 'long': 11,
 '28': 8,
 'get': 57,
 'bennett': 3,
 'end': 29,
 'asking': 2,
 'series': 12,
 'pound': 2,
 'see': 45,
 'mass': 3,
 'unit': 1,
 'avoirdupois': 1,
 'derives': 1,
 'abbreviation': 1,
 'thx': 1,
 'aulia': 1,
 'mrbrown': 5,
 'sea': 1,
 'black': 3,
 'along': 3,
 'sochi2014': 24,
 'construction': 3,
 'shore': 1,
 'foreign': 39,
 'minister': 35,
 'seckerryu2019s': 5,
 'video': 55,
 'meeting': 31,
 'remark': 18,
 'westerwelle': 4,
 'transcript': 8,
 'ponting': 3,
 'price': 17,
 'iplauction': 127,
 '400000': 4,
 'go': 72,
 'player': 47,
 'mumbai': 2,
 'first': 81,
 'usd': 1,
 'ricky': 2,
 'base': 14,
 'indian': 33,
 'hammer': 4,
 'begun': 2,
 'viswanathan': 3,
 'caruana': 3,
 'zurich': 3,
 'draw': 6,
 'anand': 4,
 'fabiano': 3,
 'kop': 15,
 'u': 107,
 'email': 7,
 'game': 48,
 'send': 13,
 'start': 26,
 'tweet': 25,
 'text': 6,
 'tonight': 40,
 'lfctv': 8,
 'lfctvliverpoolfccom':

In [13]:
for i in train_words_count.keys():
    if i not in test_words_count:
        test_words_count[i] = 0
test_words_count

{'mason': 3,
 'threaten': 2,
 'throw': 2,
 'yard': 3,
 'side': 14,
 'home': 17,
 'stab': 1,
 'wide': 5,
 'long': 11,
 '28': 8,
 'get': 57,
 'bennett': 3,
 'end': 29,
 'asking': 2,
 'series': 12,
 'pound': 2,
 'see': 45,
 'mass': 3,
 'unit': 1,
 'avoirdupois': 1,
 'derives': 1,
 'abbreviation': 1,
 'thx': 1,
 'aulia': 1,
 'mrbrown': 5,
 'sea': 1,
 'black': 3,
 'along': 3,
 'sochi2014': 24,
 'construction': 3,
 'shore': 1,
 'foreign': 39,
 'minister': 35,
 'seckerryu2019s': 5,
 'video': 55,
 'meeting': 31,
 'remark': 18,
 'westerwelle': 4,
 'transcript': 8,
 'ponting': 3,
 'price': 17,
 'iplauction': 127,
 '400000': 4,
 'go': 72,
 'player': 47,
 'mumbai': 2,
 'first': 81,
 'usd': 1,
 'ricky': 2,
 'base': 14,
 'indian': 33,
 'hammer': 4,
 'begun': 2,
 'viswanathan': 3,
 'caruana': 3,
 'zurich': 3,
 'draw': 6,
 'anand': 4,
 'fabiano': 3,
 'kop': 15,
 'u': 107,
 'email': 7,
 'game': 48,
 'send': 13,
 'start': 26,
 'tweet': 25,
 'text': 6,
 'tonight': 40,
 'lfctv': 8,
 'lfctvliverpoolfccom':

In [14]:
# add missing words from train data present in test data
for i in test_words_count.keys():
    if i not in train_words_count:
        train_words_count[i] = 0

In [15]:
for i in train_words_count.keys():
    train[i] = 0

# Set values in the new columns
for i in range(len(train)):
    for j in train['TweetText'][i]:
        train.loc[i, j] = 1

  


In [16]:
for i in test_words_count.keys():
    test[i] = 0

# Set values in the new columns
for i in range(len(test)):
    for j in test['TweetText'][i]:
        test.loc[i, j] = 1

  


In [17]:
train.drop(['TweetId', 'TweetText'], axis=1, inplace=True)
test.drop(['TweetId', 'TweetText'], axis=1, inplace=True)

In [18]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder().fit(train["Label"].tolist())
y = label_encoder.transform(train["Label"].tolist())
y

array([0, 0, 1, ..., 1, 1, 0])

In [19]:
X = train.drop(['Label'], axis=1)
X.head()

Unnamed: 0,deepest,value,dollar,term,usaid,measured,statedept,seckerry,american,rraina1481,...,soturelated,lazio,preserved,twostate,eliminator,reesedward,reproberthurt,charlottesville,timkaine,shankly
0,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model Training

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.230)

In [21]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_train, y_train)


0.9982085987261147

In [22]:
accuracy = clf.score(X_train, y_train)
print(f"Accuracy on validation set: {accuracy:.2f}")

Accuracy on validation set: 1.00


In [23]:
from sklearn.metrics import classification_report
pred = clf.predict(X_test)
print(classification_report(y_test, pred, target_names=("Politics", "Sports")))

              precision    recall  f1-score   support

    Politics       0.95      0.94      0.95       730
      Sports       0.94      0.96      0.95       771

    accuracy                           0.95      1501
   macro avg       0.95      0.95      0.95      1501
weighted avg       0.95      0.95      0.95      1501



# Model Testing - Prediction 

In [26]:
def classify(label):
    
    if label == 0:
        return "Politics" 
    return "Sports"


test_data = pd.read_csv("/kaggle/input/deeptweets/test.csv")


test_data['Label']=[classify(label) for label in clf.predict(test)]
test_data.to_csv('predictions.csv',index=False)

Feature names must be in the same order as they were in fit.

