In [2]:
%load_ext cuml.accel

In [2]:
import cudf as pd
import cupy as np
from cuml.model_selection import train_test_split
from cuml.feature_extraction.text import CountVectorizer
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.linear_model import LogisticRegression
from cuml.metrics import accuracy_score

In [4]:
dataset = pd.read_csv('/home/zzeno/my-dataset.csv')

In [5]:
dataset.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset.columns = col_names

In [10]:
dataset.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [12]:
dataset.shape

(1599999, 6)

In [13]:
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [14]:
dataset['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

In [15]:
dataset['target'] = dataset['target'].map({4:1, 0:0})

In [16]:
dataset['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

In [17]:
print("Cleaning text on GPU...")
processed_text = dataset['text'].str.lower()
processed_text = processed_text.str.replace(r'http\S+', '', regex=True)
processed_text = processed_text.str.replace(r'@\w+', '', regex=True)
processed_text = processed_text.str.replace(r'^rt\s+', '', regex=True)
processed_text = processed_text.str.replace(r'#', '', regex=False)
processed_text = processed_text.str.replace(r'[^a-z\s]', '', regex=True)
processed_text = processed_text.str.replace(r'\s+', ' ', regex=True)
processed_text = processed_text.str.strip()
dataset['text'] = processed_text
print('Text cleaning complete.')

Cleaning text on GPU...
Text cleaning complete.


In [18]:
dataset['text'].head()

0    is upset that he cant update his facebook by t...
1    i dived many times for the ball managed to sav...
2       my whole body feels itchy and like its on fire
3    no its not behaving at all im mad why am i her...
4                                   not the whole crew
Name: text, dtype: object

In [19]:
print('Splitting dataframe...')
df_clean = dataset[['text', 'target']]

Splitting dataframe...


In [20]:
valid_rows = df_clean['text'].str.len() > 0
df_clean = df_clean[valid_rows]

In [21]:
df_clean.shape

(1596341, 2)

In [22]:
train_df, test_df = train_test_split(
    df_clean,
    test_size=0.2,
    random_state=100,
    stratify=df_clean['target']
)

print(train_df.shape)
print(test_df.shape)

(1277073, 2)
(319268, 2)


In [23]:
X_train = train_df['text']
y_train = train_df['target']

X_test = test_df['text']
y_test = test_df['target']

In [24]:
print('Starting Vectorizing on GPU...')
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000, ngram_range=(1, 2), min_df=5, max_df=0.5)

Starting Vectorizing on GPU...


In [25]:
X_train_vector = vectorizer.fit_transform(X_train)

In [26]:
X_test_vector = vectorizer.transform(X_test)

In [27]:
print('Vectorization complete.')

Vectorization complete.


In [38]:
print(X_train_vector.shape)
print(X_test_vector.shape)

(1277073, 50000)
(319268, 50000)


In [29]:
print('Initizaling the Model...')

lr = LogisticRegression()

print('Model Initialized.')

Initizaling the Model...
Model Initialized.


In [30]:
print('Training the model...')
lr.fit(X_train_vector, y_train)
print('Training Complete.')

Training the model...
Training Complete.


In [31]:
y_pred_lr = lr.predict(X_test_vector)

In [32]:
print('Accuracy Score: ',accuracy_score(y_test, y_pred_lr))

Accuracy Score:  0.7837897941541275


In [33]:
from cuml.naive_bayes import MultinomialNB
nb = MultinomialNB(alpha=0)

In [34]:
print('Training model...')
nb.fit(X_train_vector, y_train)
print('Training Complete.')

Training model...
Training Complete.


In [35]:
y_pred_nb = nb.predict(X_test_vector)
print('Accuracy Score: ',accuracy_score(y_test, y_pred_nb))

Accuracy Score:  0.7679754939423932


In [36]:
from cuml.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_vector, y_train)
y_pred_knn = knn.predict(X_test_vector)
print(accuracy_score(y_test, y_pred_knn))

0.631992557976371


In [71]:
from cuml.metrics import confusion_matrix

In [72]:
print('Confusion Matrix for LogisticRegression: \n')
confusion_matrix(y_test, y_pred_lr)

Confusion Matrix for LogisticRegression: 



array([[122032,  37633],
       [ 30644, 128959]])

In [73]:
print('Confusion Matrix for Naive-Bayes: \n')
confusion_matrix(y_test, y_pred_nb)

Confusion Matrix for Naive-Bayes: 



array([[122778,      0],
       [ 36785,      0]])

In [74]:
print('Confusion Matrix for KNN:\n')
confusion_matrix(y_test, y_pred_knn)

Confusion Matrix for KNN:



array([[ 63712,  95953],
       [ 21539, 138064]])