In [1]:
# Use the Azure Machine Learning data collector to log various metrics.
# NOTE(review): `logger` is not referenced in any of the cells visible in this
# section of the notebook — confirm it is used further down before keeping it.
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()

In [2]:
# Use Azure Machine Learning history magic to control history collection
# History is off by default, options are "on", "off", or "show"
# %azureml history on

In [3]:
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile
import urllib.request
from tempfile import mktemp

In [4]:
# Root working folder. Resolved from the current user's home directory instead of
# a hard-coded absolute path so the notebook is not tied to one machine/account.
base_path=os.path.join(os.path.expanduser('~'), 'Documents', 'AzureML')
# Sub-folder of base_path in which the data is downloaded and processed
base_folder='data'

# URL to download the sentiment140 dataset
data_url='http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip'

In [5]:
# Functions to download and process data

def change_base_dir(base_dir_path):
    """Make ``base_dir_path`` the current working directory.

    When the path does not exist yet it is created first (including any
    missing parent directories), and a message saying so is printed.
    """

    needs_creation = not os.path.exists(base_dir_path)
    if needs_creation:
        print ('creating directory', base_dir_path)
        os.makedirs(base_dir_path)

    # Every relative path used afterwards resolves against this directory.
    print ('Changing base directory to ', base_dir_path)
    os.chdir(base_dir_path)

def download_data(download_url, filename='downloaded_data.zip'):
    """Download a zip archive and unpack it into the current working directory.

    Args:
        download_url: URL of the zip archive, handed to
            ``urllib.request.urlretrieve``.
        filename: Name under which the archive is stored in the current
            working directory before extraction.

    Errors raised by ``urlretrieve`` or ``ZipFile`` propagate unchanged.
    """

    downloaded_filename = os.path.join('.', filename)
    print ('Step 1: Downloading data')
    urllib.request.urlretrieve(download_url,downloaded_filename)
    print ('Step 2: Extracting data')
    # The context manager closes the archive even when extraction fails
    # part-way; the previous explicit close() was skipped on any exception.
    with ZipFile(downloaded_filename) as archive:
        archive.extractall('./')

def _export_split(split_df, split_name):
    """Write one split to '<split_name>_label.csv' and '<split_name>_text.csv' and print its label counts."""

    print ('Exporting {} data of rows {}'.format(split_name, split_df.shape[0]))
    split_df[['Label']].to_csv(split_name+'_label.csv', header=False, index=False)
    split_df[['Text']].to_csv(split_name+'_text.csv', header=False, index=False)
    print ('Target distribution in the {} data is as follows'.format(split_name))
    print ('\n',split_df['Label'].value_counts())


def extract_tweets_and_labels(filename, train_fraction=0.8, random_state=None):
    """Split the downloaded tweet CSV into shuffled train/test label and text files.

    The header-less CSV is read with six columns
    (Label, TweetId, Date, Query, User, Text). Rows whose Label equals 2
    (neutral) are dropped, the remainder is shuffled, tweet text is stripped
    of surrounding whitespace, and the result is split into a training and a
    testing part. Four files are written to the current working directory:
    ``training_label.csv``, ``training_text.csv``, ``testing_label.csv`` and
    ``testing_text.csv`` (no header, no index).

    Args:
        filename: Path of the CSV file to read (decoded as iso-8859-1).
        train_fraction: Share of the filtered rows that goes to the training
            split; the rest becomes the testing split. Defaults to 0.8.
        random_state: Seed forwarded to ``DataFrame.sample`` for the shuffle.
            ``None`` (the default) keeps the shuffle non-deterministic; pass an
            int to make the split reproducible.

    Raises:
        ValueError: If ``train_fraction`` is outside the range [0, 1].
    """

    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError('train_fraction must be between 0 and 1, got {}'.format(train_fraction))

    print ('Step 3: Reading the data as a dataframe')
    df=pd.read_csv(filename, header=None, encoding='iso-8859-1')
    df.columns=['Label','TweetId','Date','Query','User','Text']
    print ('Read {} lines'.format(df.shape[0]))
    print ('Discarding neutral tweets')
    df=df[df.Label!=2]
    print ('No of lines in the data after filtering neutral tweets: {}'.format(df.shape[0]))

    print ('Step 4: Shuffling the data')
    train_length=int(df.shape[0]*train_fraction)
    df=df.sample(frac=1, random_state=random_state) # reshuffling the data
    # assign() builds a new frame, so nothing is written into a filtered
    # view of the original data.
    df=df.assign(Text=df['Text'].astype(str).str.strip())
    print (df.head())

    print ('Step 5: Dividing into test and train datasets')
    df_train = df.iloc[:train_length, :]
    df_test = df.iloc[train_length:, :]

    print ('Step 6: Exporting the train and test datasets')
    _export_split(df_train, 'training')
    # The testing split used to be announced as "training data" (copy-paste slip).
    _export_split(df_test, 'testing')

In [6]:
# Download and process the data

# os.path.join uses the host OS's separator instead of a hard-coded '\\'
base_dir_path=os.path.join(base_path, base_folder)
change_base_dir(base_dir_path)
download_data(data_url)
extract_tweets_and_labels('training.1600000.processed.noemoticon.csv')

Changing base directory to  C:\Users\ds1\Documents\AzureML\data
Step 1: Downloading data
Step 2: Extracting data
Step 3: Reading the data as a dataframe
Read 1600000 lines
Discarding neutral tweets
No of lines in the data after filtering neutral tweets: 1600000
Step 4: Shuffling the data
         Label     TweetId                          Date     Query  \
583617       0  2215002587  Wed Jun 17 17:32:06 PDT 2009  NO_QUERY   
1530030      4  2177699730  Mon Jun 15 06:33:19 PDT 2009  NO_QUERY   
127209       0  1834675142  Mon May 18 03:49:23 PDT 2009  NO_QUERY   
19557        0  1556879231  Sun Apr 19 00:48:15 PDT 2009  NO_QUERY   
353007       0  2031663185  Thu Jun 04 10:08:26 PDT 2009  NO_QUERY   

                User                                               Text  
583617    dwaitsbaby                                         work sucks  
1530030    minxkitty           @K8loulee hello hun! Nice to 'meet' you!  
127209    TferThomas  @melissaox Ah I understand now. I will admit t