In [1]:
import time
import re
import numpy as np
import pandas as pd
import warnings;warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [17]:
df_train1 = pd.read_csv('train.csv',lineterminator='\n')
df_test = pd.read_csv('test.csv',lineterminator='\n')
df_train = df_train1.loc[:, ['ID','review','label']]
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,Negative
1,2,ya Allah meri sister Affia ki madad farma,Positive
2,3,Yeh khud chahta a is umar main shadi krna. ha...,Negative
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,Negative
4,5,Good,Positive


In [18]:
df_train['label'] = df_train['label'].map(lambda x: 0 if x == 'Negative' else 1)
# df_train['label'].replace('Positive',1,inplace=True)
df_train.head()

Unnamed: 0,ID,review,label
0,1,Jo bhi ap se tou behtar hoon,0
1,2,ya Allah meri sister Affia ki madad farma,1
2,3,Yeh khud chahta a is umar main shadi krna. ha...,0
3,4,Tc ? Apky mun xe exe alfax achy nae lgty 😒💃,0
4,5,Good,1


In [19]:
df_test.head()

Unnamed: 0,ID,review
0,1,Yaqoob Memon Ki Phansi Zalimana Ghair Insani H...
1,2,Sabit qadam rehna
2,3,Good decision on ko bhi aam shehryun ki tarah ...
3,4,Jo Shakhs ALLAH Aur Qayamat Per Eman Rakhta Ho...
4,5,Tm log veriat kohli ko ahmad shahzad k sath co...


In [20]:
df_train['label'].value_counts()

1    3361
0    2967
Name: label, dtype: int64

In [21]:
df_train.isnull().sum()

ID        0
review    0
label     0
dtype: int64

In [22]:
df_test.isnull().sum()

ID        0
review    7
dtype: int64

In [23]:
numpy_array = df_train.as_matrix()
numpy_array_test = df_test.as_matrix()
numpy_array[:4]

array([[1, 'Jo bhi ap se tou behtar hoon', 0],
       [2, 'ya Allah meri sister Affia ki madad farma', 1],
       [3, 'Yeh khud chahta a is umar main shadi krna.  had ogi', 0],
       [4, 'Tc ? Apky mun xe exe alfax achy nae lgty 😒💃', 0]],
      dtype=object)

In [24]:
numpy_array_test[115]

array(['Is pr jab aap Code Dial Krty hain to aap ko 100 balance milta hai jo aap sirf call aur sms krny k lye use kr sakty hain aur is k sath 500mb internet b 1 mahiny k lye. Ye offer 1 mahiny baad khud bakhud activate ho jaye gi. Matlab 3 mahiny tak aap ko is offer sy faida milta rahy ga.',
       nan], dtype=object)

In [25]:
#two commom ways to clean data
def cleaner(word):
  word = re.sub(r'\#\.', '', word)
  word = re.sub(r'\n', '', word)
  word = re.sub(r',', '', word)
  word = re.sub(r'\-', ' ', word)
  word = re.sub(r'\.', '', word)
  word = re.sub(r'\\', ' ', word)
  word = re.sub(r'\\x\.+', '', word)
  word = re.sub(r'\d', '', word)
  word = re.sub(r'^_.', '', word)
  word = re.sub(r'_', ' ', word)
  word = re.sub(r'^ ', '', word)
  word = re.sub(r' $', '', word)
  word = re.sub(r'\?', '', word)
  word = re.sub(r'é', '', word)
  word = re.sub(r'§', '', word)
  word = re.sub(r'¦', '', word)
  word = re.sub(r'æ', '', word)
  word = re.sub(r'\d+', '', word)
  word = re.sub('(.*?)\d+(.*?)', '', word)
  return word.lower()
def hashing(word):
  word = re.sub(r'ain$', r'ein', word)
  word = re.sub(r'ai', r'ae', word)
  word = re.sub(r'ay$', r'e', word)
  word = re.sub(r'ey$', r'e', word)
  word = re.sub(r'ie$', r'y', word)
  word = re.sub(r'^es', r'is', word)
  word = re.sub(r'a+', r'a', word)
  word = re.sub(r'j+', r'j', word)
  word = re.sub(r'd+', r'd', word)
  word = re.sub(r'u', r'o', word)
  word = re.sub(r'o+', r'o', word)
  word = re.sub(r'ee+', r'i', word)
  if not re.match(r'ar', word):
    word = re.sub(r'ar', r'r', word)
  word = re.sub(r'iy+', r'i', word)
  word = re.sub(r'ih+', r'eh', word)
  word = re.sub(r's+', r's', word)
  if re.search(r'[rst]y', 'word') and word[-1] != 'y':
    word = re.sub(r'y', r'i', word)
  if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
    word = re.sub(r'i$', r'y', word)
  if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
    word = re.sub(r'h', '', word)
  word = re.sub(r'k', r'q', word)
  return word

def array_cleaner(array):
  # X = array
  X = []
  for sentence in array:
    clean_sentence = ''
    words = sentence.split(' ')
    for word in words:
      clean_sentence = clean_sentence +' '+ cleaner(word)
    X.append(clean_sentence)
  return X

In [26]:
X_test = numpy_array_test[:,1]
X_test

array(['Yaqoob Memon Ki Phansi Zalimana Ghair Insani Hai 20 Saal Qaid Kaat Chukay Thay Amnesty International Ki Bharti Iqdam Ki Muzammat ',
       ' Sabit qadam rehna',
       'Good decision on ko bhi aam shehryun ki tarah Huqooq hony chahye',
       ..., 'Lanat haramdyo', 'Is episode mein koi bhi hot nai hai',
       'Mery liye dua kry Allah pak naseeb achy kry ameen'], dtype=object)

In [27]:
#test if there are nan 
counter = 1
for sentence in X_test:
    try:
        words = sentence.split(' ')
        counter+=1
    except:
        print(sentence)
        print(counter)

nan
113
nan
113
nan
113
nan
113
nan
113
nan
113
nan
1950


In [28]:
X_train = numpy_array[:, 1]
# Clean X here
X_train = array_cleaner(X_train)
X_test = array_cleaner(X_test)
y_train = numpy_array[:, 2]
X_train[:5]

AttributeError: 'float' object has no attribute 'split'

In [None]:
print(len(X_train))
print(len(X_test))

In [None]:
y_train = np.array(y_train)
y_train = y_train.astype('int8')
y_train[:6]

In [None]:
ngram = 3
vectorizer = TfidfVectorizer(sublinear_tf=True,ngram_range=(1, ngram), max_df=0.5)

In [14]:
X_all = X_train + X_test # Combine both to fit the TFIDF vectorization.
lentrain = len(X_train)

vectorizer.fit(X_all) # This is the slow part!
X_all = vectorizer.transform(X_all)
print(X_all)

ValueError: operands could not be broadcast together with shapes (6328,) (2719,) 

In [15]:
vectorizer.get_feature_names()[-5:]

NameError: name 'vectorizer' is not defined

In [16]:
X_all.shape

NameError: name 'X_all' is not defined

In [21]:
X_train_chuli = X_all[:lentrain] # Separate back into training and test sets. 
X_test_chuli = X_all[lentrain:]
print(X_train_chuli)

<6306x224684 sparse matrix of type '<class 'numpy.float64'>'
	with 256086 stored elements in Compressed Sparse Row format>

In [76]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.linear_model import LogisticRegression as LG

In [77]:
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=2019)
oof = np.zeros(X_train_chuli.shape[0])
predictions = np.zeros(X_test_chuli.shape[0])

In [78]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data = X_train_chuli[trn_idx]
    trn_label= y_train[trn_idx]
    val_data = X_train_chuli[val_idx]
    val_label= y_train[val_idx]
    model_SGD = NB()                      
    model_SGD.fit(trn_data, trn_label) # Fit the model.
    print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, model_SGD.predict_proba(val_data)[:,1])))
    predictions += model_SGD.predict_proba(X_test_chuli)[:,1] / folds.n_splits

Fold :1
auc score: 0.87730 
Fold :2
auc score: 0.84500 
Fold :3
auc score: 0.85388 
Fold :4
auc score: 0.86265 
Fold :5
auc score: 0.85859 
Fold :6
auc score: 0.83793 
Fold :7
auc score: 0.87822 
Fold :8
auc score: 0.83750 
Fold :9
auc score: 0.84872 
Fold :10
auc score: 0.87115 


In [30]:
print(len(predictions))
predictions[:4]

2712


array([1.36800743, 1.15362139, 0.71803962, 1.42031524])

In [31]:
SGD_output = pd.DataFrame({"ID":df_test["ID"], "Pred":predictions})
SGD_output.to_csv('SGD_new.csv', index = False)