# Notebook Imports

In [22]:
import pandas as pd
import numpy as np

# Constants

In [23]:
VOCAB_SIZE = 2500

# READ and LOAD Features from .txt Filse into NumPy Array

In [24]:
sparse_train_data = np.loadtxt('trainning_data.txt', delimiter=' ', dtype=str)
sparse_test_data = np.loadtxt('test_data.txt', delimiter=' ', dtype=str)

In [25]:
print('num rows in trainning file', sparse_train_data.shape[0])
print('num rows in test file', sparse_test_data.shape[0])

num rows in trainning file 58047
num rows in test file 24594


In [26]:
print('num tweets in trainning file', np.unique(sparse_train_data[:, 0]).size)
print('num tweets in trainning file', np.unique(sparse_test_data[:, 0]).size)

num tweets in trainning file 10204
num tweets in trainning file 4373


In [74]:
sparse_train_data

array([['1', '171', 'Spam', '1'],
       ['1', '908', 'Spam', '1'],
       ['1', '1245', 'Spam', '1'],
       ...,
       ['14898', '597', 'Spam', '1'],
       ['14898', '876', 'Spam', '1'],
       ['14898', '1128', 'Spam', '1']], dtype='<U7')

# How to create an empty DataFrame

In [84]:
column_names = ['CATEGORY'] + list(range(0, VOCAB_SIZE))
len(column_names)

2501

In [128]:
sparse_test_data[:, 0]

array(['0', '0', '0', ..., '14892', '14897', '14897'], dtype='<U7')

In [85]:
index_names = np.unique(sparse_train_data[:, 0])
index_names

array(['1', '10', '100', ..., '9995', '9996', '9998'], dtype='<U7')

In [101]:
test_data = pd.DataFrame(index=index_names, columns=column_names)
test_data.fillna(value=0, inplace=True)
test_data['CATEGORY'] = test_data['CATEGORY'].astype(str)
test_data['CATEGORY'] = 'test'

dtype('int64')

In [104]:
test_data.head()

Unnamed: 0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
1,test,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,test,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,test,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,test,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10000,test,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
test_data[171].dtypes

dtype('int64')

# Create a Full Matrix from a Sparse Matrix

In [95]:
def make_full_matrix(sparse_matrix, num_words, tweet_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    '''
    Form a full matrix from a sparse matrix.
    Return pandas dataframe.
    
    Keyword arguments:
    sparse_matrix -- numpy array
    num_words -- size of vocabulary. Total number of tokens.
    TWEET_IDx -- position of the document id in the sparse matrix
    word_idx -- position of the word id in the sparse matrix
    cat_idx -- position of the label in the sparse matrix
    freq_idx -- position of the frequency in the sparse matrix
    '''
    column_names = ['CATEGORY'] + list(range(num_words))
    tweet_id_names = np.unique(sparse_matrix[:, 0])
    full_matrix = pd.DataFrame(index=tweet_id_names, columns=column_names)
    full_matrix.fillna(value=0, inplace=True)
    full_matrix.head()
    
    # this line needs fixing
    full_matrix['CATEGORY'] = full_matrix['CATEGORY'].astype(str)
    
    for i in range(sparse_matrix.shape[0]):
        tweet_num = sparse_matrix[i][tweet_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]
        
        print('stats', tweet_num, word_id, label, occurrence)
        
        #full_matrix.at[tweet_num, 'TWEET_ID'] = tweet_num
        full_matrix.at[tweet_num, 'CATEGORY'] = label
        full_matrix.at[tweet_num, word_id] = occurrence
        
        break
    
    #full_matrix.set_index('TWEET_ID', inplace=True)
    print(full_matrix.head())
    
    return full_matrix

In [96]:
%%time

full_train_data = make_full_matrix(sparse_train_data, VOCAB_SIZE)

stats 1 171 Spam 1
      CATEGORY  0  1  2  3  4  5  6  7  8  ...  2491  2492  2493  2494  2495  \
1         Spam  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
10           0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
100          0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
1000         0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   
10000        0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0   

       2496  2497  2498  2499  171  
1         0     0     0     0    1  
10        0     0     0     0  NaN  
100       0     0     0     0  NaN  
1000      0     0     0     0  NaN  
10000     0     0     0     0  NaN  

[5 rows x 2502 columns]
CPU times: user 4.9 s, sys: 126 ms, total: 5.03 s
Wall time: 4.96 s


  self.obj[key] = infer_fill_value(value)


In [114]:
full_train_data.shape
full_train_data.head()

Unnamed: 0,CATEGORY,0,1,2,3,4,5,6,7,8,...,2491,2492,2493,2494,2495,2496,2497,2498,2499,171
1,Spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
10000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [126]:
full_train_data.loc['1']

CATEGORY    Spam
0              0
1              0
2              0
3              0
            ... 
2496           0
2497           0
2498           0
2499           0
171            1
Name: 1, Length: 2502, dtype: object