<a href="https://colab.research.google.com/github/DebankurS/NLP-Projects/blob/master/Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
import numpy as np
import pandas as pd
# Download the NLTK stopword lists that stopwords.words("english") reads later in the notebook.
nltk.download('stopwords')
# Show full cell text when displaying frames. None is the supported "no limit" value;
# the old -1 sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [0]:
# Load the training set; fields in this file are separated by "~" rather than commas.
train_data = pd.read_csv(
    "https://raw.githubusercontent.com/DebankurS/NLP-Projects/master/train.csv",
    sep="~",
    index_col=None,
)
train_data.head()

In [0]:
# Distinct values of the Browser_Used column.
browsers = train_data.Browser_Used.tolist()
set(browsers)

In [0]:
# Distinct values of the Device_Used column.
devices = train_data.Device_Used.tolist()
set(devices)

In [0]:
# Distinct values of the Is_Response (target) column.
response = train_data.Is_Response.tolist()
set(response)

In [0]:
# Text-only view: drop the id, the categorical columns and the target.
train_text_data = train_data.drop(
    columns=['User_ID', 'Browser_Used', 'Device_Used', 'Is_Response'],
)

In [0]:
# Preview the text-only frame.
train_text_data.head()

In [0]:
# Categorical-only view: drop the id, the free text and the target.
train_categorical_data = train_data.drop(
    columns=['User_ID', 'Description', 'Is_Response'],
)

In [0]:
# Preview the categorical-only frame.
train_categorical_data.head()

In [0]:
# Target-only view: drop every column that is passed to drop(), i.e. id, text and categoricals.
target_data = train_data.drop(
    columns=['User_ID', 'Description', 'Browser_Used', 'Device_Used'],
)

In [0]:
# Preview the target-only frame.
target_data.head()

In [0]:
import pickle
# Persist the three frames as one tuple. The context manager closes (and flushes) the file,
# which the bare open(...) call never did.
with open('dataset.pkl', 'wb') as dataset_file:
    pickle.dump((train_text_data, train_categorical_data, target_data), dataset_file)

In [0]:
from nltk.corpus import stopwords
from string import punctuation
import re
def sentence_to_words(sentence):
    """Turn a raw sentence into a list of lower-case tokens with English stopwords removed.

    Args:
        sentence: the text to tokenise (must support ``.lower()``).

    Returns:
        list of tokens, in their original order, that are not NLTK English stopwords.
    """
    # Lower-case first, then replace every character that is not an ASCII letter or digit
    # with a space so punctuation acts as a token separator.
    sentence = re.sub(r"[^a-zA-Z0-9]", " ", sentence.lower())
    words = sentence.split()  # split on runs of whitespace
    # Look the stopword list up once and use a set: the previous version evaluated
    # stopwords.words("english") again for every token and did a list membership test.
    english_stopwords = set(stopwords.words("english"))
    return [w for w in words if w not in english_stopwords]

In [0]:
# Inspect a single raw description before tokenising.
train_data.Description[10]

In [0]:
# Try the tokeniser on one description to eyeball its output.
sentence_to_words(train_data.Description[1])

In [0]:
from sklearn.externals import joblib
import pickle
def preprocess_data(data,cache_file="preprocessed_data.pkl"):
    """Tokenise every sentence in ``data``, reusing a pickle cache on disk when one exists.

    Args:
        data: iterable of sentences; each one is passed to ``sentence_to_words``.
        cache_file: path of the pickle cache. ``None`` disables both reading and writing.

    Returns:
        The object loaded from ``cache_file`` when it could be read and is not ``None``;
        otherwise a list holding one token list per sentence.

    Note:
        The cache is not keyed on ``data``: an existing cache file is returned as-is
        even if it was produced from different input.
    """
    cache_data = None
    if cache_file is not None:
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only use cache files you created.
            with open(cache_file, "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except FileNotFoundError:
            pass  # first run: nothing cached yet
        except Exception as exc:
            # Unreadable/corrupt cache: report it and fall back to recomputing
            # instead of silently swallowing every error (including Ctrl-C).
            cache_data = None
            print("Could not read cache file:", cache_file, "-", exc)

    if cache_data is not None:
        return cache_data

    words_train = [sentence_to_words(sentence) for sentence in data]
    if cache_file is not None:
        with open(cache_file, "wb") as f:
            # Store the freshly computed tokens; the earlier code dumped cache_data,
            # which is always None on this path, so the cache never held anything useful.
            pickle.dump(words_train, f)
        print("Wrote preprocessed data to cache file:", cache_file)
    return words_train

In [0]:
# Tokenise every description (default cache file "preprocessed_data.pkl" is used).
# NOTE(review): `tr` is not referenced by any later cell visible here — confirm it is still needed.
tr=preprocess_data(train_data.Description)

In [0]:
import pickle
# Reload the three frames saved to dataset.pkl. The context manager closes the file,
# which the bare open(...) call never did.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created.
with open('dataset.pkl', 'rb') as dataset_file:
    train_text_data, train_categorical_data, target_data = pickle.load(dataset_file)

In [0]:
# Raw descriptions as a plain Python list, one entry per row.
text_data = train_text_data['Description'].tolist()

In [0]:
# Concatenate every description into one string, then split on whitespace to get the
# full token stream (no lower-casing or punctuation stripping is applied here).
all_text = ' '.join(text_data)
words = all_text.split()

In [0]:
from collections import Counter

# Rank tokens by corpus frequency, most frequent first (ties keep first-seen order),
# and number them from 1 so that id 0 is never assigned to a token.
counts = Counter(words)
vocab = sorted(counts, key=lambda token: -counts[token])
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Encode every description as the list of ids of its whitespace-separated tokens.
text_data_ints = [
    [vocab_to_int[word] for word in text.split()]
    for text in text_data
]

In [0]:
def pad_features(text_ints, seq_length):
    """Left-pad with zeros (or truncate) each id sequence to exactly ``seq_length`` entries.

    Args:
        text_ints: sequence of integer sequences, one per text.
        seq_length: number of columns in the result.

    Returns:
        Integer array of shape ``(len(text_ints), seq_length)``. Rows shorter than
        ``seq_length`` are right-aligned with leading zeros; longer rows keep only
        their first ``seq_length`` ids; empty rows stay all zeros.
    """
    features = np.zeros((len(text_ints), seq_length), dtype=int)

    for i, row in enumerate(text_ints):
        kept = row[:seq_length]
        # Skip empty rows: a "-0:" slice would select the whole row and the empty
        # right-hand side could not be broadcast into it.
        if len(kept):
            features[i, -len(kept):] = kept

    return features

In [0]:
# Pad/truncate every encoded description to 1000 ids, then convert the array to nested Python lists.
features = pad_features(text_data_ints, seq_length=1000).tolist()

In [0]:
# One-column frame; each cell holds the padded id list of one description.
text_data_df = pd.DataFrame({'Description':features})

In [0]:
import numpy as np
output_feature = 'Is_Response'
# target_data starts as a DataFrame and ends as a list of labels. The guard keeps this
# cell re-runnable: without it a second run indexes a list with a string and fails.
if isinstance(target_data, pd.DataFrame):
    target_data = target_data[output_feature].tolist()
# Binary target: 1 for the label "good" (case-insensitive), 0 for anything else.
encoded_target = np.array([1 if target.lower() == 'good' else 0 for target in target_data])

In [0]:
# Wrap the 0/1 target array in a single-column frame so it can be concatenated with the features.
encoded_target_df = pd.DataFrame({'Is_Response': encoded_target})

In [0]:
# Columns that are label-encoded below and passed to TabularDataset as the categorical inputs.
categorical_features = ['Browser_Used', 'Device_Used']

In [0]:
from sklearn.preprocessing import LabelEncoder
# One fitted encoder per categorical column, kept so the integer codes can be mapped back later.
label_encoders = {cat_col: LabelEncoder() for cat_col in categorical_features}
for cat_col, encoder in label_encoders.items():
  # Replace the column in place with its integer codes.
  train_categorical_data[cat_col] = encoder.fit_transform(train_categorical_data[cat_col])

In [0]:
# Sanity check: row count and a preview of the text frame before it is concatenated.
print(len(text_data_df))
text_data_df.head()

In [0]:
# Sanity check: row count and a preview of the label-encoded categorical frame.
print(len(train_categorical_data))
train_categorical_data.head()

In [0]:
# Sanity check: row count and a preview of the encoded target frame.
print(len(encoded_target_df))
encoded_target_df.head()

In [0]:
# Put text, categorical and target columns side by side; an axis=1 concat matches rows by index label.
dataset_df = pd.concat([text_data_df, train_categorical_data, encoded_target_df], axis=1)

In [0]:
# Preview the combined frame.
dataset_df.head()

In [0]:
from torch.utils.data import Dataset, DataLoader

class TabularDataset(Dataset):
  """Dataset that serves, per row, the target, the text feature and the categorical codes.

  NOTE(review): ``text_cols`` is stored but never read; the text feature always comes
  from the ``'Description'`` column of ``data``.
  """

  def __init__(self, data, text_cols, cat_cols, output_col):
    """Cache the columns of ``data`` as arrays.

    Args:
      data: frame containing ``'Description'``, the ``cat_cols`` columns and ``output_col``.
      text_cols: kept on the instance as ``self.text_cols``; not otherwise used.
      cat_cols: names of the categorical columns; their values are cast to int64.
      output_col: name of the target column.
    """
    self.text_cols = text_cols
    self.cat_cols = cat_cols
    self.y_col = output_col

    self.n = len(data)
    self.y = data[self.y_col].values
    self.text_X = data['Description'].values
    self.cat_X = data[self.cat_cols].astype(np.int64).values

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return self.n

  def __getitem__(self, idx):
    """Return ``[target, text feature, categorical codes]`` for row ``idx``."""
    target, text, categorical = self.y[idx], self.text_X[idx], self.cat_X[idx]
    return [target, text, categorical]

In [0]:
# Build the dataset from the combined frame.
# NOTE(review): the second argument (text_cols) receives the text DataFrame rather than
# column names; TabularDataset only stores it and reads 'Description' directly.
dataset = TabularDataset(dataset_df, text_data_df, categorical_features, output_feature)