In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import pandas as pd
import csv
import numpy as np
import nltk
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk.corpus 
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import  LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from pprint import pprint
import string 
import re 
import gc
import glob
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/zip-tweets/Data/cricket_hashtag.csv
/kaggle/input/zip-tweets/Data/football_hashtag.csv
/kaggle/input/zip-tweets/Data/machine learning_hashtag.csv
/kaggle/input/zip-tweets/Data/mobiles_hashtag.csv
/kaggle/input/zip-tweets/Data/happy birthday_hashtag.csv
/kaggle/input/zip-tweets/Data/bollywood_hashtag.csv
/kaggle/input/zip-tweets/Data/Politics_hashtag.csv
/kaggle/input/zip-tweets/Data/hollywood_hashtag.csv
/kaggle/input/zip-tweets/Data/bigboss_hashtag.csv
/kaggle/input/zip-tweets/Data/food_hashtag.csv


# Stitching all files together

In [2]:

def stitch_csv(dirname):
    extension='csv'
    all_files=[i for i in glob.glob(os.path.join(dirname,'*.{}'.format(extension)))]
    combined_file=[]
    for f in all_files:
        file=pd.read_csv(f)
        file['label']=os.path.splitext(os.path.basename(f))[0]
        combined_file.append(file)
    combined_csv=pd.concat(combined_file,ignore_index=True)
    return combined_csv

csv_file=stitch_csv(dirname)

# Data Check

In [None]:
#data observation

def data_obs(data):
    print("dataset size:")
    print(data.shape)
    print(data.head(10))


data_obs(csv_file) #function can be used to check the dimensions of dataset

In [None]:
#distribution of classes for prediction
def create_distribution(dataFile):
    
    return sb.countplot(x='label', data=dataFile, palette='hls')

create_distribution(csv_file)

In [None]:
#data integrity check i.e, we need to check for null entries in all the three variables.
#none of the datasets contains missing values therefore no cleaning required
def data_qualityCheck(data):
    
    print("Check Started...")
    print(data.isnull().sum())
    print("Check Started...")
    data.info()
        
    print("Check finished.")

    
# data_qualityCheck() can be run to see the quality check results as well as integrity constraints.
data_qualityCheck(csv_file)

# ML Pre-Processing

In [None]:
text=pd.Series(csv_file.iloc[:]['text'])

In [None]:
print(csv_file.iloc[:10]['label'])

In [None]:
#Converting label values into no. for classification purpose
def to_label(data):
    lst=[]
    for label in data:
        
        if label == "happy birthday_hashtag":
            lst.append(0)
        elif label == "food_hashtag":
            lst.append(1)
        elif label == "football_hashtag":
            lst.append(2)
        elif label == "cricket_hashtag":
            lst.append(3)
        elif label == "Politics_hashtag":
            lst.append(4)
        elif label == "machine learning_hashtag":
            lst.append(5)
        elif label == "hollywood_hashtag":
            lst.append(6)
        elif label == "bollywood_hashtag":
            lst.append(7)
        elif label == "bigboss_hashtag":
            lst.append(8)
        elif label == "mobiles_hashtag":
            lst.append(9)
        else:
            print("AAAAAA")
    return lst
    

label=pd.Series(to_label(csv_file['label']))

In [None]:
def clean_text(text):
        
        if text:
            
            text1 = re.sub('^b','', str(text))
            text1 = re.sub(r'(\\x[a-z]*[0-9]*)','', str(text1))
            text2=re.sub('(\s[hhtps:]\S)\s','', str(text1))
            text2=text2.lower()
        else:
            pass

        return text2

for i,j in enumerate(text):
    text[i]=clean_text(j)

In [None]:
#performing train test split
train_x,test_x,train_y,test_y=train_test_split(text,label,test_size=0.1,random_state=1)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)

# ML Feature-Selection

In [None]:
"""
Before we can train an algorithm to classify labels, we need to extract features from it. It means reducing the mass
of unstructured data into some uniform set of attributes that an algorithm can understand. For tweet classification, it could be 
tf-idf along with n-grams. 
"""



#we will start with simple bag of words technique 
#creating feature vector - document term matrix
countV = CountVectorizer()
train_count = countV.fit_transform(train_x.values)



#print training doc term matrix
def get_countVectorizer_stats():
    
    #vocab size
    print(train_count.shape)

    #check vocabulary using below command
    #print(countV.vocabulary_)

    #get feature names
    #print(countV.get_feature_names()[:25])
get_countVectorizer_stats()

#create tf-df frequency features
#tf-idf 
tfidfV = TfidfTransformer()
train_tfidf = tfidfV.fit_transform(train_count)

def get_tfidf_stats():
    print(train_tfidf.shape)
    #get train data feature names 
    print(train_tfidf.A[:10])
    
get_tfidf_stats()


tfidf_ngram = TfidfVectorizer(stop_words='english',ngram_range=(1,4),use_idf=True,smooth_idf=True)

     

# ML Model And Prediction

In [None]:

#naive-bayes classifier
nb_pipeline_ngram = Pipeline([
        ('nb_tfidf',tfidf_ngram),
        ('nb_clf',MultinomialNB())])

nb_pipeline_ngram.fit(train_x,train_y)
predicted_nb_ngram = nb_pipeline_ngram.predict(test_x)
print(np.mean(predicted_nb_ngram == test_y))

#========================================================================================

#=========================================================================================

print(classification_report(test_y, predicted_nb_ngram))

print(test_y.shape)




# Deep Learning Feature Selection

In [4]:
from nltk.corpus import stopwords
from nltk import word_tokenize

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
csv_file['text'] = csv_file['text'].apply(clean_text)
csv_file['text'] = csv_file['text'].str.replace('\d+', '')


In [7]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(csv_file['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 64002 unique tokens.


In [8]:
X = tokenizer.texts_to_sequences(csv_file['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (30000, 250)


In [9]:
Y = pd.get_dummies(csv_file['label']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (30000, 10)


In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(27000, 250) (27000, 10)
(3000, 250) (3000, 10)


# Deep Learning Model

In [15]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(10, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 250, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1010      
Total params: 5,081,410
Trainable params: 5,081,410
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
epochs = 5
batch_size =32

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


# Prediction Scores

In [28]:
lst=[]
for i in predicted_val:
    lst.append(np.argmax(i))
Pred_Y=pd.get_dummies(lst).values
predicted_val = model.predict(X_test)
print(np.mean(Pred_Y ==Y_test))

#========================================================================================

#=========================================================================================

print(classification_report(Y_test, Pred_Y))

print(Y_test.shape)




0.9918
              precision    recall  f1-score   support

           0       0.96      0.82      0.88        83
           1       0.99      0.99      0.99       442
           2       0.96      0.96      0.96       517
           3       0.99      0.96      0.98       485
           4       0.91      0.97      0.94       268
           5       0.96      0.97      0.96       510
           6       0.94      0.93      0.94       151
           7       0.94      0.95      0.95       483
           8       1.00      0.78      0.88        27
           9       0.97      0.91      0.94        34

   micro avg       0.96      0.96      0.96      3000
   macro avg       0.96      0.93      0.94      3000
weighted avg       0.96      0.96      0.96      3000
 samples avg       0.96      0.96      0.96      3000

(3000, 10)
