In [155]:
import pandas as pd
import numpy as np
from collections import Counter
import tensorflow as tf
from tensorflow import keras


In [156]:
# Load the two corpora from CSV files in the working directory.
# The English file is read first, then the Hinglish one.
english_df, hinglish_df = (
    pd.read_csv(csv_name)
    for csv_name in ("english_text.csv", "hinglish_text.csv")
)

In [157]:
# Tag every row with its class id: 0 = English, 1 = Hinglish.
# This adds (or overwrites) a "label" column on each frame in place.
english_df["label"], hinglish_df["label"] = 0, 1


In [172]:
# Stack both corpora into a single frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent and, like append's default,
# keeps each frame's original index values.
mixed_df = pd.concat([english_df, hinglish_df])

# Shuffle every row (a sample as large as the frame, without replacement) and
# convert to a plain NumPy array. A fixed seed makes the shuffle -- and therefore
# the train/validation split taken later -- reproducible across kernel restarts.
SHUFFLE_SEED = 42
random_mixed_df = mixed_df.sample(len(mixed_df), random_state=SHUFFLE_SEED).values

# Discard the first column and keep the rest.
# NOTE(review): presumably the first CSV column is a saved row index and the
# files hold exactly one text column after it, so that text lands at position 0
# and the label at position 1 -- TODO confirm against the CSV schema.
random_mixed_df = random_mixed_df[:, 1:]

# Split the remaining columns into parallel Python lists.
texts = list(random_mixed_df[:, 0])
labels = list(random_mixed_df[:, 1])
    


In [196]:
# Empty word-frequency tallies: one per language and one for the whole corpus.
english_counts, hinglish_counts, total_counts = Counter(), Counter(), Counter()

In [197]:
# Tally token frequencies per language and overall.
# Sentences are split on single spaces only, so tokens stay case-sensitive and
# consecutive spaces produce empty-string tokens.
# NOTE(review): the counters are created in a separate cell, so re-running this
# cell on its own adds every count a second time.
for sentence, label in zip(texts, labels):
    tokens = sentence.split(' ')
    # Label 0 marks English; any other label is counted as Hinglish.
    language_counts = english_counts if label == 0 else hinglish_counts
    language_counts.update(tokens)
    total_counts.update(tokens)

In [198]:
# Spot check: number of times the token 'the' was counted across the whole corpus.
total_counts['the']

49536

In [199]:
# Encode each sentence as a list of integers, one per space-separated token.
# NOTE(review): the integer used is the token's corpus frequency
# (total_counts[token]), not a unique vocabulary id, so different words that
# occur equally often share the same code. The Testing() helper encodes its
# input the same way, so any change to this scheme must be made in both places.
layer_0 = [
    [total_counts[token] for token in sentence.split(' ')]
    for sentence in texts
]

In [200]:
# Bring every encoded sentence to a fixed length of 971 integers: shorter ones
# get zeros appended at the end; longer ones are cut according to the library's
# default `truncating` setting.
x_train = keras.preprocessing.sequence.pad_sequences(
    layer_0,
    maxlen=971,
    padding='post',
    value=0,
)

In [201]:
# Sanity check: total number of labelled sentences.
len(labels)

59176

In [202]:
# Number of distinct tokens seen in the corpus.
vocab_size = len(total_counts)

# An Embedding layer needs input_dim to be strictly greater than the largest
# integer it is fed. The encoded inputs are token frequencies rather than ids in
# [0, vocab_size), so a very frequent token can exceed vocab_size. Size the
# table to cover whichever is larger so no lookup falls out of range.
embedding_input_dim = max(vocab_size, int(x_train.max()) + 1)

# Small bag-of-embeddings binary classifier:
#   integer codes -> 16-d embeddings -> mean over the sequence -> 16 ReLU units
#   -> one sigmoid unit.
model = keras.Sequential([
    keras.layers.Embedding(embedding_input_dim, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          916656    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 916,945
Trainable params: 916,945
Non-trainable params: 0
_________________________________________________________________


In [203]:
# Binary classification setup: cross-entropy loss on the sigmoid output,
# Adam optimizer, and accuracy reported under the metric name 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)


In [204]:
# Hold out the first VAL_SIZE rows for validation and train on the rest.
# The rows were shuffled when `texts`/`labels` were built, so a positional
# split is a random split.
VAL_SIZE = 10000

x_val = x_train[:VAL_SIZE]
partial_x_train = x_train[VAL_SIZE:]

# `labels` is a plain Python list while the features are a NumPy array.
# Convert the labels to an array as well: tf.keras in TF 2.x can fail to find a
# data adapter for a NumPy `x` paired with a list `y`, and array slices avoid
# re-converting the list on every fit/evaluate call.
label_array = np.asarray(labels, dtype="float32")
y_val = label_array[:VAL_SIZE]
partial_y_train = label_array[VAL_SIZE:]

In [205]:
# Sanity check: (rows left for training, padded sequence length).
partial_x_train.shape

(49176, 971)

In [209]:
# Train for 40 epochs in mini-batches of 512, reporting loss/accuracy on the
# held-out validation rows after every epoch. The returned History object is
# kept in `history`.
history = model.fit(
    partial_x_train,
    partial_y_train,
    batch_size=512,
    epochs=40,
    validation_data=(x_val, y_val),
    verbose=1,
)

Train on 49176 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [206]:
# Report [loss, accuracy] on the held-out validation rows.
# NOTE(review): the saved output for this cell carries execution count 206,
# which is lower than the fit cell's 209, so the recorded numbers may predate
# the latest training run -- re-run this cell after fitting before quoting them.
model.evaluate(x_val,y_val)



[0.6942797678947449, 0.074]

In [267]:
def Testing(sentence, threshold=0.4, maxlen=971):
    """Classify a single sentence as English or Hinglish with the trained model.

    The sentence is encoded the same way as the training data: split on single
    spaces, each token replaced by its corpus frequency from ``total_counts``,
    then zero-padded at the end to ``maxlen``.

    Parameters
    ----------
    sentence : str
        Raw text to classify.
    threshold : float, optional
        Decision cut-off on the model's sigmoid output. Scores strictly below
        it are reported as English, everything else as Hinglish. Defaults to
        0.4, the value this notebook has always used.
    maxlen : int, optional
        Padded sequence length. Defaults to 971, the length used when the
        training matrix was built.

    Returns
    -------
    str
        ``"English Sentence"`` or ``"Hinglish Sentence"``.

    Notes
    -----
    ``total_counts`` is a ``Counter``, so tokens never seen during counting
    encode to 0 -- the same value used for padding.
    """
    # One-row batch: the model expects a 2-D array of encoded sequences.
    encoded_batch = [[total_counts[token] for token in sentence.split(' ')]]

    vector = keras.preprocessing.sequence.pad_sequences(
        encoded_batch, value=0, padding='post', maxlen=maxlen
    )

    # predict() returns one row per input with a single sigmoid score in it.
    score = model.predict(vector)[0][0]

    if score < threshold:
        return "English Sentence"
    return "Hinglish Sentence"
        

In [272]:
# Try the classifier on one English and one Hinglish example sentence.
sample_sentences = (
    "what are you even talking aout man",
    "tum kya baat kar rahe ho yar",
)
prediction1, prediction2 = map(Testing, sample_sentences)

In [273]:
# Show both predictions, one per line.
print(prediction1, prediction2, sep="\n")

English Sentence
Hinglish Sentence
