# Notebook Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Constants

In [4]:
# Vocabulary size: passed as num_words to make_full_matrix, which creates one
# count column per word id in range(VOCAB_SIZE).
VOCAB_SIZE = 2500

# READ and LOAD Features from .txt Files into NumPy Array

In [5]:
# Both files are space-delimited and every field is read as a string
# (dtype=str), so ids and counts need int() conversion before numeric use.
sparse_train_data = np.loadtxt('trainning_data.txt', delimiter=' ', dtype=str)
sparse_test_data = np.loadtxt('test_data.txt', delimiter=' ', dtype=str)

In [6]:
# Number of sparse entries (array rows) loaded from each file.
print('num rows in trainning file', len(sparse_train_data))
print('num rows in test file', len(sparse_test_data))

num rows in trainning file 58047
num rows in test file 24594


In [7]:
# Column 0 is the tweet id (make_full_matrix's default tweet_idx), so the number
# of distinct values in it is the number of tweets in each file.
print('num tweets in trainning file', np.unique(sparse_train_data[:, 0]).size)
# The second count is for the test set; label it as such.
print('num tweets in test file', np.unique(sparse_test_data[:, 0]).size)

num tweets in trainning file 10204
num tweets in trainning file 4373


In [8]:
# Inspect the raw sparse rows; with make_full_matrix's default column indices
# each row is read as [tweet id, word id, label, frequency].
sparse_train_data

array([['1', '171', 'Spam', '1'],
       ['1', '908', 'Spam', '1'],
       ['1', '1245', 'Spam', '1'],
       ...,
       ['14898', '597', 'Spam', '1'],
       ['14898', '876', 'Spam', '1'],
       ['14898', '1128', 'Spam', '1']], dtype='<U7')

# Create a Full Matrix from a Sparse Matrix

In [9]:
def make_full_matrix(sparse_matrix, num_words, tweet_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    '''
    Form a full matrix from a sparse matrix.
    Return pandas dataframe.

    Every sparse row describes one (tweet, word) pair together with the tweet's
    label and the word's frequency. The result has one row per distinct tweet
    id, a 'CATEGORY' column holding the label, and integer columns
    0 .. num_words - 1 holding the frequencies (0 where a word does not occur).

    Keyword arguments:
    sparse_matrix -- numpy array (or nested sequence) of sparse rows. A single
                     1-D row is treated as a one-row matrix.
    num_words -- size of vocabulary. Total number of tokens.
    tweet_idx -- position of the document id in the sparse matrix
    word_idx -- position of the word id in the sparse matrix
    cat_idx -- position of the label in the sparse matrix
    freq_idx -- position of the frequency in the sparse matrix

    Notes:
    The index order is whatever np.unique returns for the id column, so ids
    stored as strings come out in lexicographic order (1, 10, 100, ...).
    If the same (tweet, word) pair appears more than once, the last row wins.
    '''
    # A one-line file loaded with np.loadtxt arrives as a 1-D array; promote it
    # so the column slicing below works for that case as well.
    sparse_matrix = np.atleast_2d(sparse_matrix)

    # Use the caller-supplied id column (not a hard-coded column 0) for the index.
    tweet_id_names = np.unique(sparse_matrix[:, tweet_idx]).astype(int)

    # Allocate the counts as a real integer block up front instead of creating
    # an all-NaN object frame and relying on fillna() to downcast it.
    counts = np.zeros((len(tweet_id_names), num_words), dtype=np.int64)
    full_matrix = pd.DataFrame(counts, index=tweet_id_names, columns=range(num_words))

    # Placeholder label column; every tweet in the index has at least one
    # sparse row, so each placeholder is overwritten in the loop below.
    full_matrix.insert(0, 'CATEGORY', '')

    # Parse each column once rather than converting field by field in the loop.
    tweet_nums = sparse_matrix[:, tweet_idx].astype(int).tolist()
    word_ids = sparse_matrix[:, word_idx].astype(int).tolist()
    labels = sparse_matrix[:, cat_idx]
    occurrences = sparse_matrix[:, freq_idx].astype(int).tolist()

    # populate return matrix
    for tweet_num, word_id, label, occurrence in zip(tweet_nums, word_ids, labels, occurrences):
        full_matrix.at[tweet_num, 'CATEGORY'] = label
        full_matrix.at[tweet_num, word_id] = occurrence

    return full_matrix

In [10]:
%%time
# Expand the sparse training rows into one row per tweet with VOCAB_SIZE
# word-count columns plus the 'CATEGORY' label column.
full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

CPU times: user 5.76 s, sys: 189 ms, total: 5.95 s
Wall time: 5.9 s


In [11]:
# Expect (number of distinct tweet ids, VOCAB_SIZE + 1); the extra column is 'CATEGORY'.
full_train_data.shape

(10204, 2501)

In [12]:
# Rows appear as 1, 10, 100, ... because make_full_matrix orders the index with
# np.unique on the string ids (lexicographic order) before converting to int.
full_train_data

Unnamed: 0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
1,Spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,Spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,Spam,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,Quality,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000,Quality,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,Quality,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9994,Quality,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9995,Quality,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,Spam,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
%%time
# Build the test matrix with the same vocabulary size so its columns line up
# with those of the training matrix.
full_test_data = make_full_matrix(sparse_test_data, VOCAB_SIZE)

CPU times: user 2.58 s, sys: 80.3 ms, total: 2.66 s
Wall time: 2.62 s


In [14]:
# Same layout as the training frame: 'CATEGORY' followed by word-count columns
# 0 .. VOCAB_SIZE - 1, indexed by tweet id.
full_test_data

Unnamed: 0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,Quality,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10003,Spam,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10008,Spam,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
10009,Spam,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10017,Quality,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983,Spam,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9988,Spam,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9992,Spam,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,Spam,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
