In [0]:
from collections import defaultdict
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint
from sklearn.linear_model import ElasticNet
import numpy as np
import random
import copy
from google.colab import files
from google.colab import drive
import seaborn as sns

%matplotlib inline

In [0]:
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
real_males_df = pd.read_csv("/content/drive/My Drive/Colab_Notebooks/Indian-Male-Names.csv")
real_females_df = pd.read_csv("/content/drive/My Drive/Colab_Notebooks/Indian-Female-Names.csv")

In [0]:
real_males_df.head()

Unnamed: 0,name,gender,race
0,barjraj,m,indian
1,ramdin verma,m,indian
2,sharat chandran,m,indian
3,birender mandal,m,indian
4,amit,m,indian


In [0]:
 real_females_df.head()

Unnamed: 0,name,gender,race
0,shivani,f,indian
1,isha,f,indian
2,smt shyani devi,f,indian
3,divya,f,indian
4,mansi,f,indian


In [0]:
fake_males_df = real_males_df.copy(deep=True)
fake_females_df = real_females_df.copy(deep=True)

In [0]:
fake_males_df.name = fake_males_df.name.apply(lambda x: str(x))
fake_females_df.name = fake_females_df.name.apply(lambda x: str(x))

In [0]:
fake_males_df.name = fake_males_df.name.apply(lambda x: "".join(random.sample(x, len(x))))
fake_females_df.name = fake_females_df.name.apply(lambda x: "".join(random.sample(x, len(x))))
fake_males_df.race = "fake_name"
fake_females_df.race = "fake_name"

In [0]:
fake_males_df.head()

Unnamed: 0,name,gender,race
0,ajrjrab,m,fake_name
1,v inaerdmram,m,fake_name
2,tn cranahhdsraa,m,fake_name
3,rrnblnieedama d,m,fake_name
4,imta,m,fake_name


In [0]:
fake_females_df.head()

Unnamed: 0,name,gender,race
0,shivani,f,fake_name
1,ahis,f,fake_name
2,mivsatn deshyi,f,fake_name
3,vayid,f,fake_name
4,nmias,f,fake_name


In [0]:
real_males_df['real_0/fake_1'] = 0
fake_males_df["real_0/fake_1"] = 1
real_females_df["real_0/fake_1"] = 0
fake_females_df["real_0/fake_1"] = 1

In [0]:
real_males_df.shape, fake_males_df.shape, real_females_df.shape, fake_females_df.shape

((14845, 4), (14845, 4), (15382, 4), (15382, 4))

In [0]:
combined_names_df = real_males_df.append(fake_males_df).append(real_females_df).append(fake_females_df)

In [0]:
combined_names_df.head()

Unnamed: 0,name,gender,race,real_0/fake_1
0,barjraj,m,indian,0
1,ramdin verma,m,indian,0
2,sharat chandran,m,indian,0
3,birender mandal,m,indian,0
4,amit,m,indian,0


In [0]:
combined_names_df.shape

(60454, 4)

In [0]:
combined_names_df = combined_names_df.sample(frac=1).reset_index(drop=True)

In [0]:
combined_names_df.head()

Unnamed: 0,name,gender,race,real_0/fake_1
0,jafauamrsrad h,m,fake_name,1
1,hardeep @ hunny,m,indian,0
2,santosh mahto,m,indian,0
3,aabid,m,indian,0
4,hlrau,m,fake_name,1


In [0]:
combined_names_df.dropna(inplace=True)
combined_names_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60399 entries, 0 to 60453
Data columns (total 4 columns):
name             60399 non-null object
gender           60399 non-null object
race             60399 non-null object
real_0/fake_1    60399 non-null int64
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [0]:
le = LabelEncoder()
le.fit(combined_names_df.gender)
print(le.classes_)
encoded_gender = le.transform(combined_names_df.gender)
combined_names_df.gender = encoded_gender
combined_names_df.info()

['f' 'm']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 60399 entries, 0 to 60453
Data columns (total 4 columns):
name             60399 non-null object
gender           60399 non-null int64
race             60399 non-null object
real_0/fake_1    60399 non-null int64
dtypes: int64(2), object(2)
memory usage: 2.3+ MB


In [0]:
combined_names_df.name = combined_names_df.name.str.lower()
combines_names_X = combined_names_df["name"]
combined_names_y = combined_names_df["real_0/fake_1"]

X_train, X_test, y_train, y_test = train_test_split(combines_names_X, combined_names_y, 
                                                    test_size=0.25, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train, y_train)
count_test = count_vectorizer.transform(X_test)

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train, y_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

print(count_vectorizer.get_feature_names()[:10])
print(tfidf_vectorizer.get_feature_names()[:10])

(45299,) (45299,) (15100,) (15100,)
['02', '0hm', '0huithd2i', '0ia', '0knaree', '0mk', '0msiako', '0n', '0वन', '105']
['02', '0hm', '0huithd2i', '0ia', '0knaree', '0mk', '0msiako', '0n', '0वन', '105']


In [0]:
multiNB_model = MultinomialNB(alpha=0.1)
multiNB_model.fit(count_train, y_train)
count_preds = multiNB_model.predict(count_test)
print("Accuracy score by Count Vectorizer and Multinomial Naive Bayes: " + str(accuracy_score(y_test, count_preds)*100) + "%")

Accuracy score by Count Vectorizer and Multinomial Naive Bayes: 94.45033112582782%


In [0]:
multiNB_model_tfidf = MultinomialNB(alpha=0.1)
multiNB_model_tfidf.fit(tfidf_train, y_train)
tfidf_preds = multiNB_model_tfidf.predict(tfidf_test)
print("Accuracy score by TfIdf Vectorizer and Multinomial Naive Bayes: " + str(accuracy_score(y_test, tfidf_preds)*100) +"%")

Accuracy score by TfIdf Vectorizer and Multinomial Naive Bayes: 94.38410596026489%


In [0]:
dTree_count_model = DecisionTreeClassifier()
dTree_count_model.fit(count_train, y_train)
dTree_count_preds = dTree_count_model.predict(count_test)

print("Accuracy score by Count Vectorizer and Decision Tree Classifier: " + str(accuracy_score(y_test, dTree_count_preds)*100) +"%")

Accuracy score by Count Vectorizer and Decision Tree Classifier: 61.49006622516556%


In [0]:
dTree_tfidf_model = DecisionTreeClassifier()
dTree_tfidf_model.fit(tfidf_train, y_train)
dTree_tfidf_preds = dTree_tfidf_model.predict(tfidf_test)

print("Accuracy score by Tfidf Vectorizer and Decision Tree Classifier: " + str(accuracy_score(y_test, dTree_tfidf_preds)*100) +"%")

Accuracy score by Tfidf Vectorizer and Decision Tree Classifier: 61.50993377483444%


In [0]:
def is_fake_or_real(name, vectorizer = "tfidf", model="nb"):
    lower_name = copy.deepcopy(str.lower(name))
    name_series = pd.Series(lower_name)
    if vectorizer == "tfidf":
        transformed_name = tfidf_vectorizer.transform(name_series)
    else:
        transformed_name = count_vectorizer.transform(name_series)
        
    prediction = multiNB_model_tfidf.predict(transformed_name)
    if prediction[0] == 0:
        return "{} is valid\n".format(name)
    else:
        return "{} is invalid\n".format(name)

print(is_fake_or_real("gurpreet"))
print(is_fake_or_real("justin"))
print(is_fake_or_real("sjbfhsb"))
print(is_fake_or_real("raju"))
print(is_fake_or_real("adfv"))

gurpreet is valid

justin is valid

sjbfhsb is invalid

raju is valid

adfv is invalid



In [0]:
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords 
from tensorflow.contrib.tensorboard.plugins import projector


In [0]:
num_words = 20000
tokenizer = Tokenizer(num_words=num_words)
data_text = combined_names_df.name
tokenizer.fit_on_texts(X_train)

In [0]:
tokenizer.word_index

{'kumar': 1,
 'singh': 2,
 'devi': 3,
 'smt': 4,
 'a': 5,
 'kumari': 6,
 'o': 7,
 'pooja': 8,
 'sharma': 9,
 'ram': 10,
 's': 11,
 'i': 12,
 'r': 13,
 'lal': 14,
 'm': 15,
 'n': 16,
 'h': 17,
 'jyoti': 18,
 'd': 19,
 'mohd': 20,
 'deepak': 21,
 'sunita': 22,
 'u': 23,
 'rahul': 24,
 'e': 25,
 'rekha': 26,
 'yadav': 27,
 'poonam': 28,
 'mamta': 29,
 'sonu': 30,
 'bai': 31,
 'k': 32,
 'chand': 33,
 'neha': 34,
 'amit': 35,
 'suman': 36,
 'nisha': 37,
 'gupta': 38,
 'seema': 39,
 'laxmi': 40,
 'meena': 41,
 'khan': 42,
 'kaur': 43,
 'priyanka': 44,
 'sanjay': 45,
 'soni': 46,
 'anita': 47,
 'l': 48,
 'sunil': 49,
 'ravi': 50,
 'raj': 51,
 'raju': 52,
 'pinki': 53,
 'kajal': 54,
 'kavita': 55,
 'ajay': 56,
 'vijay': 57,
 'manju': 58,
 'rani': 59,
 't': 60,
 'preeti': 61,
 'rajesh': 62,
 'manish': 63,
 'mukesh': 64,
 'manoj': 65,
 'rohit': 66,
 'geeta': 67,
 'sandeep': 68,
 'aarti': 69,
 'aa': 70,
 'komal': 71,
 'anil': 72,
 'pal': 73,
 'santosh': 74,
 'ali': 75,
 'kiran': 76,
 'manisha': 7

In [0]:
x_train_tokens = tokenizer.texts_to_sequences(X_train)

In [0]:
X_train[0]

'jafauamrsrad h'

In [0]:
x_test_tokens = tokenizer.texts_to_sequences(X_test)

In [0]:
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)

print("Mean tokens: " + str(np.mean(num_tokens)))
print("Max tokens: " + str(np.max(num_tokens)))

Mean tokens: 1.1543237470819054
Max tokens: 13


In [0]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
print("Max tokens: " + str(max_tokens))
print("Covers " + str((np.sum(num_tokens < max_tokens) / len(num_tokens))*100) + "% of dataset")

Max tokens: 3
Covers 94.60255964502724% of dataset


In [0]:
pad = 'pre'
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding=pad, truncating=pad)
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)

In [0]:
x_train_pad.shape, x_test_pad.shape

((45299, 3), (15100, 3))

In [0]:
OUTDIR = './Graph'

import shutil
shutil.rmtree(OUTDIR, ignore_errors = True)

model = Sequential()
embedding_size=8

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1 , activation="sigmoid"))

optimizer = Adam(lr=0.001)

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 3, 8)              160000    
_________________________________________________________________
gru_3 (GRU)                  (None, 3, 16)             1200      
_________________________________________________________________
gru_4 (GRU)                  (None, 3, 8)              600       
_________________________________________________________________
gru_5 (GRU)                  (None, 4)                 156       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 5         
Total params: 161,961
Trainable params: 161,961
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorboardcolab import *

tbc = TensorBoardColab()

Using TensorFlow backend.


Wait for 8 seconds...
TensorBoard link:
https://1c49bf6e.ngrok.io


In [0]:
config = projector.ProjectorConfig()
projector.visualize_embeddings(tf.summary.FileWriter(OUTDIR), config)

In [0]:
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=10, batch_size=64)
# model.fit(x_train_pad, y_train,
#           validation_split=0.05, epochs=10, batch_size=64, 
#           callbacks=[TensorBoardColabCallback(tbc)])

Train on 43034 samples, validate on 2265 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2e5f0b80f0>

In [0]:
%%time
result = model.evaluate(x_test_pad, y_test)

CPU times: user 1.37 s, sys: 155 ms, total: 1.53 s
Wall time: 1.05 s


In [0]:
def predict_validity(texts):
    texts = [str.lower(text) for text in texts]
    tokens = tokenizer.texts_to_sequences(texts)
    tokens_pad = pad_sequences(tokens, maxlen=max_tokens,
                           padding=pad, truncating=pad)
    print(tokens_pad.shape)
    predictions = model.predict(tokens_pad)
    results = []
    i = 0
    for pred in predictions:
        if(pred <0.5):
            results.append("{} is Valid".format(texts[i]))
        else:
            results.append("{} is Invalid".format(texts[i]))
        i = i+1
    return results

text1 = "gurpreet"
text2 = "raju"
text3 = "ram"
text4 = "afkjafkj"
text5 = "Iadfadf"
text6 = "hello"
text7 = "saddam hussain"
text8 = "Md"
texts = [text1, text2, text3, text4, text5, text6, text7, text8]

predict_validity(texts)

(8, 3)


['gurpreet is Valid',
 'raju is Valid',
 'ram is Valid',
 'afkjafkj is Invalid',
 'iadfadf is Invalid',
 'hello is Invalid',
 'saddam hussain is Valid',
 'md is Valid']