In [3]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('poster')

In [4]:
import csv

In [5]:
def select_subset_of_columns(file_to_read, file_to_write, columns):
    '''
    Takes an input csv file and a list of column names from that file's header and writes
    a new csv file to disk containing only the specified columns.
    
    params: 
        file_to_read (string) (file.csv) path to some existing csv file to read
        file_to_write (string) (file.csv) path to csv file to write as output
        columns (list) list of columns in header of file_to_read to keep.
        
    returns:
        no returns
        creates and output file that is a subset of the input file with just the specified columns retained.
    '''
    with open(file_to_read, 'r') as file_in:

        csv_in = csv.DictReader(file_in)

        header = next(csv_in)
        
        for col in columns:
            if col not in header:
                print(f'no column by the name {col}')
                return
            
        with open(file_to_write, 'w') as file_out:

            csv_out = csv.writer(file_out)
            csv_out.writerow(columns)
        
            for line in csv_in:
                col_list = []
                for col in columns:
                    col_list.append(line[col])
                csv_out.writerow(col_list)
        
        print(f'A subset of {file_to_read} named {file_to_write} has been created with the columns {columns} retained.')
            

In [6]:
select_subset_of_columns('test_sample.csv', 'test_sample_brief.csv', ['countryLabel', 'continent', 'city'])

A subset of test_sample.csv named test_sample_brief.csv has been created with the columns ['countryLabel', 'continent', 'city'] retained.


In [7]:
df = pd.read_csv('test_sample_brief.csv')
df

Unnamed: 0,countryLabel,continent,city
0,Ecuador,Americas,Guayaquil
1,Turkey,Asia,Antalya
2,United States,Americas,Palm Harbor
3,Brazil,Americas,São Paulo
4,United Kingdom,Europe,Hayes
...,...,...,...
194,Spain,Europe,Premiá De Mar
195,China,Asia,Tong
196,United States,Americas,Lafayette
197,Greece,Europe,Athens


In [32]:
import random
def select_random_rows(file_to_read, file_to_write, num_rows):
    '''
    Takes an input csv file and writes a smaller csv file containing num_rows randomly seleceted from 
    the input csv file
    
    params: 
        file_to_read (string) (file.csv) path to some existing csv file to read
        file_to_write (string) (file.csv) path to csv file to write as output
        num_rows (list) number of rows to randomly select
        
    returns:
        rows_selected (list) indicies of the rows included in the new file

    '''
    with open(file_to_read, 'r') as file_in:

        csv_in = csv.reader(file_in)

        header = next(csv_in)
        
            
        with open(file_to_write, 'w') as file_out:

            csv_out = csv.writer(file_out)
            csv_out.writerow(header)
            
            length = find_csv_length(file_to_read)
        
            row_list = random.sample(range(length), num_rows)
                         
            for i, line in enumerate(csv_in):
                if i in row_list:
                    csv_out.writerow(line)
                    
                        
    print(f'A file named {file_to_write} has been created with {num_rows} rows.')
    
    return row_list       

In [None]:
import random
random.randint(a, b) # Return a random integer N such that a <= N <= b. Alias for randrange(a, b+1)
random.random()  # Return the next random floating point number in the range [0.0, 1.0)
random.uniform(a, b) # Return a random floating point number N such that a <= N <= b for a <= b and b <= N <= a for b < a.
random.expovariate(lambd) # Exponential distribution. lambd is 1.0 divided by the desired mean. It should be nonzero.
random.gauss(mu, sigma)# Gaussian distribution. mu is the mean, and sigma is the standard deviation.
random.normalvariate(mu, sigma) # Normal distribution. mu is the mean, and sigma is the standard deviation. like gauss
random.randrange(start, stop[, step]) # Return a randomly selected element from range(start, stop, step).
# This is equivalent to choice(range(start, stop, step)), but doesn’t actually build a range object.
random.choice(seq) # Return a random element from the non-empty sequence seq. If seq is empty, raises IndexError.
random.choices(population, weights=None, *, cum_weights=None, k=1)
# Return a k sized list of elements chosen from the population with replacement.
# Example,  Six roulette wheel spins (weighted sampling with replacement)
choices(['red', 'black', 'green'], [18, 18, 2], k=6)
['red', 'green', 'black', 'black', 'red', 'black']
random.shuffle(x) # Shuffle the sequence x in place
random.sample(population, k)
# example random.sample(range(10000000), k=60).
random.random()  # Return the next random floating point number in the range [0.0, 1.0)
random.uniform(a, b) # Return a random floating point number N such that a <= N <= b for a <= b and b <= N <= a for b < a.
random.expovariate(lambd) # Exponential distribution. lambd is 1.0 divided by the desired mean. It should be nonzero.
random.gauss(mu, sigma)# Gaussian distribution. mu is the mean, and sigma is the standard deviation.
random.normalvariate(mu, sigma) # Normal distribution. mu is the mean, and sigma is the standard deviation.

In [33]:
def find_csv_length(file_to_read):
    '''
    Takes an input csv file and counts its rows skipping the header
    '''
    with open(file_to_read, 'r') as file_in:

        csv_in = csv.reader(file_in)

        next(csv_in)
        
        i = 0
        for line in csv_in:
            i+=1
        
    return i
            

In [34]:
a = find_csv_length('test_sample.csv')
a

200

In [35]:
a = select_random_rows('test_sample.csv', 'test_sample_10.csv', 10)

A file named test_sample_10.csv has been created with 10 rows.


In [36]:
a

[101, 111, 76, 181, 138, 39, 163, 125, 13, 180]

In [37]:
df = pd.read_csv('test_sample_10.csv')

In [38]:
df

Unnamed: 0,course_id,user_id,registered,viewed,explored,certified,completed,ip,cc_by_ip,countryLabel,continent,city,region,subdivision,postalCode,un_major_region,un_economic_group,un_developing_nation,un_special_region,latitude,longitude,LoE,YoB,gender,grade,passing_grade,start_time,first_event,last_event,nevents,ndays_act,nplay_video,nchapters,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,roles,nprogcheck,nproblem_check,nforum_events,mode,is_active,cert_created_date,cert_modified_date,cert_status,verified_enroll_time,verified_unenroll_time,profile_country,y1_anomalous,email_domain,language_brwsr,language_brwsr_country,language_brwsr_sec,language_brwsr_sec_country,language_brwsr_code,language_brwsr_subcode,language_brwsr_sec_code,language_brwsr_sec_subcode,language_brwsr_nevents,language_brwsr_ndiff,language,language_download,language_nevents,language_ndiff,ntranscript,nshow_answer,nvideo,nvideos_unique_viewed,nvideos_total_watched,nseq_goto,nseek_video,npause_video,avg_dt,sdv_dt,max_dt,n_dt,sum_dt,roles_isBetaTester,roles_isInstructor,roles_isStaff,roles_isCCX,roles_isFinance,roles_isLibrary,roles_isSales,forumRoles_isAdmin,forumRoles_isCommunityTA,forumRoles_isModerator,forumRoles_isStudent
0,HarvardX/PH525.1x/1T2018,1243920,True,False,,False,False,202.133.49.62,IN,India,Asia,Tirupati,AP,Andhra Pradesh,517501,Southern Asia,Developing_Nations,,,13.65,79.4167,hs,1993.0,m,,0.7,2018-04-04 03:40:39,2018-04-04 03:40:39.527282,2018-04-04 03:51:18.702240,6,2,0,,,,,,,,Student,0,0,0,audit,1,,,,,,,,gmail.com,English,United States,English,India,en,US,en,IN,1,1,,,,,0,0,0,,,0,0,0,2.691979,,2.691979,2,5.383958,,,,,,,,,,,1
1,HarvardX/PH525.1x/1T2018,6990165,True,True,False,False,False,45.252.54.62,,,,,,,,,,,,,,a,1993.0,m,,0.7,2018-02-15 19:33:14,2018-02-15 00:00:00,2018-02-15 19:34:01.421516,13,1,2,1.0,,,,,,,Student,0,0,0,audit,1,,,,,,BD,,gmail.com,English,United States,,,en,US,,,8,1,en,0.0,1.0,1.0,0,0,2,1.0,0.027027,0,1,0,3.912736,4.506202,16.061624,12,46.952833,,,,,,,,,,,1
2,HarvardX/PH525.1x/1T2018,12478090,True,False,,False,False,82.47.144.24,GB,United Kingdom,Europe,Preston,LAN,Lancashire,PR4,Northern Europe,Developed regions,,,53.7809,-2.83,hs,1996.0,m,,0.7,2018-01-16 18:41:56,2018-01-16 18:41:56.107807,2018-03-13 11:00:14.773601,9,4,0,,,,,,,,Student,0,0,0,audit,0,,,,,,GB,,outlook.com,English,United Kingdom,,,en,GB,,,2,1,,,,,0,0,0,,,0,0,0,22.129156,32.296779,78.023717,5,110.645781,,,,,,,,,,,1
3,HarvardX/PH525.1x/1T2018,14896945,True,True,False,False,False,174.135.47.228,US,United States,Americas,Bakersfield,CA,California,93309,Northern America,Developed regions,,,35.3456,-119.0756,b,1985.0,m,,0.7,2018-02-16 15:51:36,2018-02-16 15:51:35.702516,2018-02-23 20:04:16.624406,9,2,0,1.0,,,,,,,Student,0,0,0,audit,1,,,,,,US,,ymail.com,English,United States,,,en,US,,,4,1,,,,,0,0,0,,,0,0,0,16.181408,30.711814,82.715007,7,162.850395,,,,,,,,,,,1
4,HarvardX/PH525.1x/1T2018,16203086,True,False,,False,False,177.226.96.124,MX,Mexico,Americas,León,GUA,Guanajuato,,Central America,Developing_Nations,,Latin America and the Caribbean,21.0931,-101.645,b,1991.0,m,,0.7,2018-01-31 00:28:48,2018-01-31 00:28:48.527927,2018-02-07 19:09:52.722066,5,2,0,,,,,,,,Student,0,0,0,audit,1,,,,,,MX,,outlook.com,Spanish; Castilian,,English,,es,419,en,,2,1,,,,,0,0,0,,,0,0,0,14.002356,4.191757,22.631934,3,33.377492,,,,,,,,,,,1
5,HarvardX/PH525.1x/1T2018,17407791,True,True,False,False,False,131.152.125.76,CH,Switzerland,Europe,Basel,BS,Basel-City,,Western Europe,Developed regions,,,47.5584,7.5733,,,,,0.7,2018-02-05 15:29:54,2018-02-05 00:00:00,2018-02-05 16:07:36.538278,85,1,1,1.0,,,,,,,Student,0,29,0,audit,1,,,,,,CH,,gmail.com,German,Germany,English,,de,DE,en,,48,1,en,0.0,3.0,1.0,0,0,1,1.0,0.027027,0,2,1,10.27534,28.629549,197.082485,81,832.302546,,,,,,,,,,,1
6,HarvardX/PH525.1x/1T2018,17649742,True,True,False,False,False,81.166.31.84,NO,Norway,Europe,Stavanger,11,Rogaland Fylke,4009,Northern Europe,Developed regions,,,58.9667,5.75,,,,,0.7,2018-02-16 17:39:44,2018-02-16 17:39:44.110624,2018-02-16 18:09:26.543409,19,1,0,1.0,,,,,,,Student,0,0,0,audit,1,,,,,,NO,,gmail.com,English,United States,"Bokmål, Norwegian; Norwegian Bokmål",,en,US,nb,,5,1,,,,,0,0,0,,,0,0,0,10.34702,31.185733,126.288024,16,165.552325,,,,,,,,,,,1
7,HarvardX/PH525.1x/1T2018,18169435,True,False,,False,False,184.69.122.58,CA,Canada,Americas,Nanaimo,BC,British Columbia,V9S,Northern America,Developed regions,,,49.174,-123.9422,b,1987.0,m,,0.7,2018-02-22 20:45:44,2018-02-22 20:45:43.987897,2018-02-22 20:46:49.420191,3,1,0,,,,,,,,Student,0,0,0,audit,1,,,,,,CA,,bccf.com,English,Canada,,,en,CA,,,1,1,,,,,0,0,0,,,0,0,0,32.716147,42.945106,63.082923,2,65.432294,,,,,,,,,,,1
8,HarvardX/PH525.1x/1T2018,18683165,True,True,False,False,False,128.6.36.179,US,United States,Americas,Piscataway,NJ,New Jersey,08854,Northern America,Developed regions,,,40.4993,-74.399,b,1990.0,m,,0.7,2018-04-02 23:43:18,2018-04-02 23:43:18.242233,2018-05-19 20:14:56.938034,59,7,0,1.0,,,,,,,Student,0,0,0,audit,1,,,,,,US,,u.rochester.edu,English,United States,,,en,US,,,25,1,en,0.0,2.0,1.0,0,0,0,,,0,0,0,3.210816,6.355495,25.328271,50,190.484014,,,,,,,,,,,1
9,HarvardX/PH525.1x/1T2018,18741487,True,False,,False,False,72.211.119.183,US,United States,North America,,,,,Northern America,Developed regions,,,38.0,-97.0,p,1994.0,f,,0.7,2018-04-04 16:13:58,2018-04-04 16:13:57.977293,2018-04-04 16:14:27.275433,8,2,0,,,,,,,,Student,0,0,0,audit,1,,,,,,US,,gmail.com,English,United States,,,en,US,,,1,1,,,,,0,0,0,,,0,0,0,9.766047,8.256521,19.13246,6,58.59628,,,,,,,,,,,1
