<a href="https://colab.research.google.com/github/Aryabhatt-O/Text-Processing/blob/main/Kaggle_twitter_disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
# from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')

In [None]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array-like with the true classes: either a 1-D sequence of
        integer class indices, or a 2-D one-hot matrix with one row per sample.
    :param predicted: Array-like matrix with class predictions, one row per
        sample and one probability per class.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before taking
        the logarithm so that predictions of exactly 0 or 1 stay finite.
    :return: Mean negative log-likelihood of the true classes.
    """
    # Accept plain Python sequences (lists, tuples) as well as NumPy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a one-hot matrix if it holds class indices.
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Vectorised equivalent of setting one_hot[i, actual[i]] = 1 per row.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -1.0 / rows * np.sum(actual * np.log(clipped))

In [None]:
# Ask google.colab for uploaded files. The next cell looks up 'train.csv' in
# the returned mapping and decodes its value from UTF-8 bytes.
from google.colab import files
uploaded = files.upload()

In [None]:
import io
import pandas as pd

# Decode the uploaded 'train.csv' entry as UTF-8 and parse it as CSV text.
train_data = pd.read_csv(io.StringIO(uploaded['train.csv'].decode("utf-8")))


In [None]:
# Bare expression: shows the DataFrame as the cell output.
train_data

In [None]:
# Labels come from the `target` column, features from the raw `text` column.
y = train_data.target.values

# Hold out 10% for validation; stratifying on y keeps the class ratio equal in
# both splits and the fixed seed makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train_data.text.values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Bare expression: shows the DataFrame as the cell output.
train_data

In [None]:
# Always start with these features. They work (almost) everytime!
# Word-level TF-IDF over 1- to 3-grams, English stop words removed, terms
# seen in fewer than 3 documents dropped.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    # Real booleans: scikit-learn deprecated integers for boolean parameters
    # and newer releases reject them during parameter validation.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# The vocabulary and IDF weights are fit on the training AND validation texts
# (no labels involved). NOTE(review): this lets validation text influence the
# features, so validation scores may be slightly optimistic.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

In [None]:
# Logistic regression on the TF-IDF features; score the validation
# probabilities with the log-loss helper defined above.
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [None]:
# Raw term counts over word 1- to 3-grams with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is built from the training and validation texts together.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv, xvalid_ctv = (ctv.transform(texts) for texts in (xtrain, xvalid))

In [None]:
# Transpose of the training TF-IDF matrix; the next cell fits an SVD on it.
A = xtrain_tfv.T

In [None]:
import numpy as np
# Fit a 120-component truncated SVD on the transposed TF-IDF matrix A.
# NOTE(review): nothing visible reads this fitted object before `svd` is
# rebound to a new TruncatedSVD fit on xtrain_ctv in a later cell.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(A)
# u, s, v = np.linalg.svd(A, full_matrices=True)

In [None]:
# L2-regularised logistic regression on the count features.
clf_lr = LogisticRegression(C=1.0, penalty='l2').fit(xtrain_ctv, ytrain)
predictions = clf_lr.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
RIDGE_MODEL = RidgeClassifier(alpha=0.005994842503189409, random_state=13)


RIDGE_MODEL.fit(xtrain_ctv, ytrain)

# RidgeClassifier has no predict_proba, so a log loss cannot be computed for
# it. The previous version printed the log loss of `predictions` left over
# from the preceding logistic-regression cell, which said nothing about this
# model. Score the ridge model's own hard predictions with F1 instead, and
# keep them in a separate name so `predictions` is left untouched.
ridge_predictions = RIDGE_MODEL.predict(xvalid_ctv)

print ("f1: %0.3f " % metrics.f1_score(yvalid, ridge_predictions))

In [None]:
# Multinomial naive Bayes on the TF-IDF features.
clf = MultinomialNB(alpha=2.782559402207126).fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [None]:
# Gradient-boosted trees on the count features; both matrices are converted
# to CSC format before being handed to XGBoost.
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))


In [None]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# 7 values of C on a log scale from 1e-3 to 1e3, each with l1 (lasso) and
# l2 (ridge) penalties.
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
# The default 'lbfgs' solver does not support the l1 penalty, so every l1
# candidate would fail to fit. 'liblinear' handles both l1 and l2.
logreg=LogisticRegression(solver='liblinear')
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(xtrain_ctv, ytrain)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [None]:
# Refit the L2 logistic regression (C=1.0) on the count features and report
# its validation log loss.
clf_lr = LogisticRegression(C=1.0, penalty='l2').fit(xtrain_ctv, ytrain)
predictions = clf_lr.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [None]:
# Project the count features onto 120 SVD components. The projection is
# learned from the training split only and then applied to both splits.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_ctv)
xtrain_svd, xvalid_svd = (svd.transform(mat) for mat in (xtrain_ctv, xvalid_ctv))

# Standardise the SVD output. The scaled copies get new names so the
# unscaled matrices stay available.
scl = preprocessing.StandardScaler().fit(xtrain_svd)
xtrain_svd_scl, xvalid_svd_scl = (scl.transform(mat) for mat in (xtrain_svd, xvalid_svd))

In [None]:
# SVM on the scaled SVD features; probability=True is required because the
# log loss needs class probabilities.
clf = SVC(C=1.0, probability=True).fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [None]:
# Fetch the NLTK 'stopwords' resource that the next cell reads.
import nltk
nltk.download('stopwords')

In [None]:
# Import stopwords with nltk.
from nltk.corpus import stopwords
stop = stopwords.words('english')
# A set gives O(1) membership tests instead of scanning the list per word.
stop_set = set(stop)
# Drop stop words from each tweet. Words are lower-cased for the comparison
# only, so capitalised stop words ("The", "And") are removed too while the
# surviving words keep their original casing.
train_data['Text_without_stopwords'] = train_data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_set))


In [None]:
# Strip digit runs. regex=True is required: recent pandas treats the pattern
# as a literal string by default, which would leave every number in place.
# The raw string avoids the invalid '\d' escape warning.
train_data['Text_stemmed_without_no'] = train_data['Text_without_stopwords'].str.replace(r'\d+', '', regex=True)

In [None]:
# Strip everything that is neither a word character nor whitespace.
# regex=True is required: recent pandas treats the pattern as a literal
# string by default, which would leave all punctuation in place.
train_data["Text_stemmed_without__stopwords_number_punc"] = train_data['Text_stemmed_without_no'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Remove the raw text and the two intermediate cleaning columns in place.
train_data.drop(
    columns=['text', 'Text_without_stopwords', 'Text_stemmed_without_no'],
    inplace=True,
)

In [None]:
import nltk
# English Snowball stemmer used by the stemming cell below.
sno = nltk.stem.SnowballStemmer('english')

In [None]:
# Stem every word of the cleaned tweet. The stemmer works on single words;
# passing the whole sentence treated it as one token, so at most its final
# suffix was stemmed. Each row becomes a list of stemmed tokens.
train_data['Text'] = train_data['Text_stemmed_without__stopwords_number_punc'].apply(
    lambda x: [sno.stem(word) for word in x.split()])

In [None]:

# The cleaned-text column is no longer needed once 'Text' exists.
train_data.drop(columns='Text_stemmed_without__stopwords_number_punc', inplace=True)

In [None]:
# Join each row's token list back into one space-separated string, then
# discard the list column.
train_data['Final_text'] = list(map(" ".join, train_data['Text'].values))
train_data.drop(columns='Text', inplace=True)

In [None]:
# Drop the two metadata columns in place, then display the frame.
train_data.drop(columns=['keyword', 'location'], inplace=True)
train_data

In [None]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing


In [None]:
# CountVectorizer with default settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the  tweets in the data
# fit_transform both learns the vocabulary from 'Final_text' and returns the
# count matrix for it.
train_vectors = count_vectorizer.fit_transform(train_data["Final_text"])

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Naive Bayes with default settings, fit on every training row (no hold-out).
# This rebinds `clf`, replacing the estimator from the earlier cells.
clf = MultinomialNB().fit(train_vectors,train_data["target"])

In [None]:
from sklearn.linear_model import LogisticRegression
# NOTE(review): this estimator is only constructed here, not fit; the
# cross-validation cell that follows scores `clf`, not `clf_lr`.
clf_lr=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')

In [None]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
# 3-fold cross-validated F1 of `clf` on the count vectors.
scores = model_selection.cross_val_score(clf, train_vectors, train_data["target"], cv=3, scoring="f1")
# Bare expression: shows the per-fold scores as the cell output.
scores

In [None]:
# Second upload; rebinds `uploaded`. The next cell reads 'test.csv' from it.
from google.colab import files
uploaded = files.upload()

In [None]:
# Decode the uploaded 'test.csv' entry as UTF-8 and parse it as CSV text.
test_data = pd.read_csv(io.StringIO(uploaded['test.csv'].decode("utf-8")))

In [None]:
ctv_test = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# NOTE(review): `ctv_test` is fit on the test texts only and is not used by
# the transform below or by any later visible cell.
ctv_test.fit(list(test_data.text.values))
# The test matrix is produced by `ctv`, the vectorizer fit earlier on
# xtrain + xvalid, so its columns match the features RIDGE_MODEL was trained
# on. Mind the near-identical names: `ctv_test` (vectorizer) vs `ctV_test`
# (matrix).
ctV_test =  ctv.transform(test_data.text.values) 
# xvalid_ctv = ctv.transform(xvalid)
# xvalid_ctv = ctv.transform(xvalid)

In [None]:
# Hard class predictions of the ridge model for the vectorised test tweets.
predicted_3rd_attempt = RIDGE_MODEL.predict(ctV_test)

In [None]:
# Import stopwords with nltk.
from nltk.corpus import stopwords
stop = stopwords.words('english')
# A set gives O(1) membership tests instead of scanning the list per word.
stop_set = set(stop)
# Drop stop words from each test tweet. Words are lower-cased for the
# comparison only, so capitalised stop words ("The", "And") are removed too
# while the surviving words keep their original casing.
test_data['Text_without_stopwords'] = test_data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_set))


In [None]:
# Strip digit runs. regex=True is required: recent pandas treats the pattern
# as a literal string by default, which would leave every number in place.
# The raw string avoids the invalid '\d' escape warning.
test_data['Text_stemmed_without_no'] = test_data['Text_without_stopwords'].str.replace(r'\d+', '', regex=True)

In [None]:
# Strip everything that is neither a word character nor whitespace.
# regex=True is required: recent pandas treats the pattern as a literal
# string by default, which would leave all punctuation in place.
test_data["Text_stemmed_without__stopwords_number_punc"] = test_data['Text_stemmed_without_no'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Remove the raw text and the two intermediate cleaning columns in place.
test_data.drop(
    columns=['text', 'Text_without_stopwords', 'Text_stemmed_without_no'],
    inplace=True,
)

In [None]:
import nltk
# English Snowball stemmer used by the stemming cell below.
sno = nltk.stem.SnowballStemmer('english')

In [None]:
# Stem every word of the cleaned tweet. The stemmer works on single words;
# passing the whole sentence treated it as one token, so at most its final
# suffix was stemmed. Each row becomes a list of stemmed tokens.
test_data['Text'] = test_data['Text_stemmed_without__stopwords_number_punc'].apply(
    lambda x: [sno.stem(word) for word in x.split()])

In [None]:
# The cleaned-text column is no longer needed once 'Text' exists.
test_data.drop(columns='Text_stemmed_without__stopwords_number_punc', inplace=True)

In [None]:
# Join each row's token list back into one space-separated string, then
# discard the list column.
test_data['Final_text'] = list(map(" ".join, test_data['Text'].values))
test_data.drop(columns='Text', inplace=True)

In [None]:
# Drop the two metadata columns in place, then display the frame.
test_data.drop(columns=['keyword', 'location'], inplace=True)
test_data

In [None]:
# transform only (no refit), so the test matrix uses the vocabulary that
# count_vectorizer learned from the training 'Final_text' column.
test_vectors = count_vectorizer.transform(test_data["Final_text"])

In [None]:
# Predict with whatever estimator `clf` is currently bound to; when cells run
# top to bottom that is the MultinomialNB fit on train_vectors.
# NOTE(review): the submission cell below writes predicted_3rd_attempt, not
# this `predicted`.
predicted = clf.predict(test_vectors)


In [None]:
# Third upload; rebinds `uploaded`. The next cell reads 'submission.csv'.
from google.colab import files
uploaded = files.upload()

In [None]:
# Decode the uploaded 'submission.csv' entry as UTF-8 and parse it as CSV.
sample_submission  = pd.read_csv(io.StringIO(uploaded['submission.csv'].decode("utf-8")))

In [None]:
# Fill the target column with the ridge-model predictions.
# NOTE(review): assumes the submission rows are in the same order as
# test_data — confirm.
sample_submission["target"] = predicted_3rd_attempt

In [None]:
# Preview the first rows before writing the file.
sample_submission.head()

In [None]:
# Write the submission without the DataFrame index column.
sample_submission.to_csv("submission3.csv", index=False)

In [None]:
# Rebinds `count_vectorizer` to a fresh default CountVectorizer.
count_vectorizer = feature_extraction.text.CountVectorizer()

## get counts for ALL tweets in the training data (the whole 'Final_text'
## column is passed, not just the first 5 rows)
example_train_vectors = count_vectorizer.fit_transform(train_data["Final_text"])

In [None]:
# Show the shape and the dense contents of the first row of the count matrix.
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by text_process below: the stop-word list and
# the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

#Cleaning the text

import string
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Return the NLTK English stop words as a frozenset, loading them once.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Remove all punctuation and digits
    2. Lower-case the remaining words
    3. Remove all English stopwords
    4. Lemmatize each word with WordNet

    Returns the cleaned text as a list of words.
    '''
    lemmatizer = WordNetLemmatizer()
    # Look the stop words up once per call instead of reloading the list for
    # every single word.
    stop_words = _english_stopwords()
    cleaned = ''.join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )
    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so testing the original casing let "The", "And", ... slip through.
    words = [word for word in (token.lower() for token in cleaned.split())
             if word not in stop_words]
    return [lemmatizer.lemmatize(word) for word in words]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
data = train_data['Final_text']
# text_process is passed as a callable analyzer, so it does the tokenising.
# NOTE(review): per the scikit-learn docs, ngram_range only applies when the
# analyzer is not callable, so (1,3) presumably has no effect here.
tfidfconvert = TfidfVectorizer(analyzer=text_process,ngram_range=(1,3)).fit(data)

X_transformed=tfidfconvert.transform(data)

# Clustering the training sentences with K-means technique

from sklearn.cluster import KMeans
# 3 clusters, k-means++ seeding, 100 restarts. No random_state is set, so
# results can differ between runs.
modelkmeans = KMeans(n_clusters=3, init='k-means++', n_init=100)
modelkmeans.fit(X_transformed)

In [None]:
from sklearn.metrics import accuracy_score, adjusted_rand_score
labels = modelkmeans.labels_
# K-means cluster ids are arbitrary (and there are 3 clusters for 2 target
# classes), so comparing them to the targets with accuracy_score depends on
# how the ids happen to be numbered. The adjusted Rand index is invariant to
# relabelling: about 0 for a random assignment, 1 for perfect agreement.
print(adjusted_rand_score(train_data['target'], labels))

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow method: fit K-means for k = 1..14 and record each model's inertia.
K = range(1,15)
Sum_of_squared_distances = []
for k in K:
    km = KMeans(n_clusters=k).fit(X_transformed)
    Sum_of_squared_distances.append(km.inertia_)

# Plot inertia against k; the bend in the curve suggests a cluster count.
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
data = train_data['Final_text']


# TF-IDF capped at 20000 features, English stop words removed.
tf_idf_vectorizor = TfidfVectorizer(stop_words = 'english',#tokenizer = tokenize_and_stem,
                             max_features = 20000)
tf_idf = tf_idf_vectorizor.fit_transform(data)
# NOTE(review): TfidfVectorizer presumably already L2-normalises rows by
# default, which would make this normalize() call redundant — confirm.
tf_idf_norm = normalize(tf_idf)
# Converts the matrix to a dense array; this can be large in memory.
tf_idf_array = tf_idf_norm.toarray()

In [None]:
# Refit count_vectorizer on the training 'Final_text' column and rebuild
# train_vectors from it.
train_vectors = count_vectorizer.fit_transform(train_data["Final_text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
# test_vectors = count_vectorizer.transform(test_df["text"])