In [None]:
import numpy as np
import pandas as pd
import feather

In [None]:
# Read the Feather file into a DataFrame and preview its first rows.
DATA_PATH = 'id_text_dialect.feather'
df = pd.read_feather(DATA_PATH)
df.head()

In [None]:
# Dimensions of the loaded frame as (n_rows, n_columns).
df.shape

In [None]:
# Number of distinct dialect labels.
df['dialect'].nunique()

In [None]:
# The distinct dialect labels themselves.
df['dialect'].unique()

In [None]:
# Row count per dialect label, most frequent first.
df.dialect.value_counts()

In [None]:
# How many rows repeat an id that already appeared earlier in the frame.
df['id'].duplicated().sum()

In [None]:
# Missing-value count per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Keep only the model input (text) and the target (dialect).
# .copy() makes df1 an independent frame, so later column assignments do not
# write into a slice of `df` (which raises SettingWithCopyWarning).
df1 = df[['text', 'dialect']].copy()
df1.head()

In [None]:
import re

# Runs of characters from four emoji-related Unicode ranges.
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

# A standalone "RT" token together with one adjacent space.
# The lookarounds require whitespace (or the string boundary) on the other
# side, so "RT" inside a longer word such as "START" is left alone.
_rt_pattern = re.compile(r" RT(?!\S)|(?<!\S)RT ")


def clean(line, tag = False):
    """Strip social-media noise from a single text string.

    Removes, in order: links starting with ``http``, ``@``-mentions,
    ``#``-hashtags, ``<...>`` mark-up tags, standalone ``RT`` tokens and
    characters matched by ``emoji_pattern``. Whitespace left behind by the
    removals is not collapsed.

    Parameters
    ----------
    line : str
        The text to clean.
    tag : bool, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    str
        The cleaned text.
    """
    line = re.sub(r"http\S+", "", line)
    line = re.sub(r"@\S+", "", line)
    line = re.sub(r"#\S+", "", line)
    line = re.sub("<[^>]*>", "", line)
    # Token-aware removal: a plain str.replace(" RT", "") / replace("RT ", "")
    # also deletes "RT" inside ordinary words ("START now" -> "STAnow").
    line = _rt_pattern.sub("", line)
    line = emoji_pattern.sub("", line)

    return line

In [None]:
# Raw text of the fourth row, for comparison with its cleaned version.
df1['text'].iloc[3]

In [None]:
# Same row passed through clean() as a spot check.
clean(df1['text'].iloc[3])

In [None]:
# Clean every text. assign() builds a new frame instead of writing a column
# into df1 in place; the in-place form raises SettingWithCopyWarning when df1
# was obtained by selecting columns from another frame.
df1 = df1.assign(text=df1['text'].map(clean))

In [None]:
# Preview the first five rows after cleaning.
df1.head(5)

In [None]:
import sklearn 
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import random

In [None]:
# Model input is the text column; the target is the dialect label.
X = df1['text']
y = df1['dialect']

In [None]:
from sklearn.model_selection import train_test_split

# One named seed drives the split. Previously SEED was defined but the call
# used a separate hard-coded random_state, so changing SEED had no effect.
SEED = 2000

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over 1- to 3-grams, vocabulary capped at 100000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=100000,
)

In [None]:
# Fit the TF-IDF vectorizer on the training texts only (X_test is not passed).
vectorizer.fit(X_train)

In [None]:
# Vectorize both splits with the vocabulary fitted on the training texts.
# Both results stay as SciPy sparse matrices. Calling .toarray() on the whole
# test split would allocate a dense n_test x n_features float array, which does
# not scale with a vocabulary of up to 100000 terms. Densify slices on demand
# instead (e.g. x_test[i:j].toarray()).
x_train = vectorizer.transform(X_train)
x_test = vectorizer.transform(X_test)

In [None]:
# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; other libraries' RNGs are not touched by these lines.
seed = 7
np.random.seed(seed)
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
def batch_generator(X_data, y_data, batch_size):
    """Yield ``(X_batch, y_batch)`` NumPy pairs forever, cycling over the data.

    Rows are served in their stored order (no shuffling). Each feature batch
    is converted to a dense array only when it is yielded, so ``X_data`` can
    stay sparse.

    Parameters
    ----------
    X_data : scipy sparse matrix or numpy.ndarray
        Feature matrix with one row per sample.
    y_data : pandas.Series or array-like
        Labels, aligned with the rows of ``X_data`` by position.
    batch_size : int
        Maximum number of rows per batch; the last batch of a pass may be
        smaller.

    Yields
    ------
    tuple of numpy.ndarray
        Dense features and labels for one batch.

    Raises
    ------
    ValueError
        If ``X_data`` and ``y_data`` do not have the same number of rows.
    """
    n_samples = X_data.shape[0]
    if np.shape(y_data)[0] != n_samples:
        raise ValueError("X_data and y_data must have the same number of rows")

    # Ceiling division: number of batches needed to cover every row once.
    number_of_batches = -(-n_samples // batch_size)
    index = np.arange(n_samples)
    counter = 0
    while True:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch, :]
        if hasattr(X_batch, "toarray"):
            X_batch = X_batch.toarray()
        # Select labels by position so a shuffled or non-unique pandas index
        # cannot misalign features and labels.
        if hasattr(y_data, "iloc"):
            y_batch = y_data.iloc[index_batch]
        else:
            y_batch = y_data[index_batch]
        yield np.asarray(X_batch), np.asarray(y_batch)
        counter += 1
        # Wrap with >=. The previous `counter > n_samples / batch_size` check
        # let one empty batch through whenever n_samples was an exact multiple
        # of batch_size.
        if counter >= number_of_batches:
            counter = 0
            




In [None]:
# Dialect identification is a multi-class problem with string labels, so the
# labels are mapped to integer codes and the network ends in one softmax unit
# per class. A single sigmoid unit with binary cross-entropy cannot represent
# more than two classes and cannot consume string targets.
dialect_classes = np.sort(y_train.unique())
y_train_codes = pd.Series(
    pd.Categorical(y_train, categories=dialect_classes).codes.astype("int32"),
    index=y_train.index,  # keep the original index so row lookups still align
)

BATCH_SIZE = 32
# Take the input width from the data: the fitted vocabulary can be smaller
# than the vectorizer's max_features cap.
n_features = x_train.shape[1]
n_classes = len(dialect_classes)

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=n_features))
model.add(Dense(n_classes, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# steps_per_epoch must be a plain integer number of batches;
# tf.strings.to_number parses string tensors and is not meant for an int count.
steps_per_epoch = int(np.ceil(x_train.shape[0] / BATCH_SIZE))

model.fit_generator(generator=batch_generator(x_train, y_train_codes, BATCH_SIZE),
                    epochs=5,
                    steps_per_epoch=steps_per_epoch)


In [None]:
print(classification_report(y_test, y_train))

In [None]:
accuracy_score(y_test, y_train)