# Built-In sequences
Python has 5 built-in sequence types:
* Strings : *Immutable*
* Tuples : *Immutable*
* Lists : *Mutable*
* Byte arrays : *Mutable*
* Bytes : *Immutable*

In [6]:
# Python's built-in sequence datatypes: build an empty instance of each and print its type
l = []
s = ''
t = ()
ba = bytearray(b'')
b = bytes([])
print(type(l))
print(type(s))
print(type(t))
print(type(ba))
print(type(b))    # bytes is its own type only in Python 3.x; on Python 2 it is an alias of str

<type 'list'>
<type 'str'>
<type 'tuple'>
<type 'bytearray'>
<type 'str'>


In [18]:
# Copying vs. aliasing a mutable collection
# sets and dictionaries are mutable collections whose elements / keys must be immutable (hashable) objects
# set.copy() makes a shallow copy; plain assignment only binds a second name to the same set
myList = {'people','tomey','dingdong'}    # note: despite its name this is a set, not a list
slayers = myList.copy()    # it creates a new, independent set (shallow copy)
slayers1 = myList    # it creates a new reference to the same set object
print(myList)
print(slayers)
print(slayers1)
slayers.discard('people')    # only the copy changes
slayers.remove('tomey')    # remove() raises KeyError for a missing element, discard() does not
slayers1.discard('dingdong')    # also changes myList, because both names refer to one set
print(myList)
print(slayers)
print(slayers1)

import copy
m = copy.copy(slayers)    # shallow copy via the copy module
n = copy.deepcopy(slayers)    # deep copy: contained objects are copied recursively as well


set(['people', 'dingdong', 'tomey'])
set(['tomey', 'dingdong', 'people'])
set(['people', 'dingdong', 'tomey'])
set(['people', 'tomey'])
set(['dingdong'])
set(['people', 'tomey'])


# Python Strings
Every Python object has two output forms:
* String form (`str()`) - readable by humans
* Representational form (`repr()`) - readable by the Python interpreter

In [21]:
# Unicode strings
# \u0020 is the Unicode escape (four hex digits, i.e. 16 bits) for the space character
# the same character encoded as ASCII fits in a single 8-bit byte

print(u'Yo\u0020Bro !')


Yo Bro !


In [26]:
# Methods for strings
# str.join is better than repeated + for concatenating the strings in a list
list_of_names = ['Abhishek','Karunesh','Arpit']
print(' '.join(list_of_names))    # join method: the string it is called on becomes the separator
print('-<>-'.join(list_of_names))
print('__|__'.join(reversed(list_of_names)))    # reversed() built-in: iterates the list back to front


Abhishek Karunesh Arpit
Abhishek-<>-Karunesh-<>-Arpit
Arpit__|__Karunesh__|__Abhishek


In [29]:
# rjust and ljust methods: pad a string to a given width with a fill character
name = 'Abhishek Shakya'
print(name.rjust(50,'-'))    # right-justify: the padding is added at the start
print(name.ljust(50,'-'))    # left-justify: the padding is added at the end


-----------------------------------Abhishek Shakya
Abhishek Shakya-----------------------------------


In [38]:
# format method
# strings can be formatted with positional placeholders
# note: .format() has to be called on the string inside print(...); calling it on the
# result of print() fails on Python 3, where print() returns None
print('{0}, {1}'.format('I am the one','No I am not!'))

# string mapping or unmapping
# the idea is to combine the locals() built-in with format: locals() returns a name -> value
# mapping of the current scope, and ** unpacks it into keyword arguments for the named placeholders
number = 999
string = 'abhishek'
print('Number {number} is a good {string}'.format(**locals()))


I am the one, No I am not!
Number 999 is a good abhishek


In [38]:
# splitlines method: break a string at its line boundaries
random_string = 'abhishek shakya\nding dong\nhello hello\nsomebody in there'
print(random_string.splitlines())

# using split
print(random_string.split('\n',2))    #    maxsplit=2: at most 2 splits, the rest stays in the last item
print(random_string.split(' '))    # splitting on a single space leaves the '\n' characters inside the items
print(random_string.split(' '))    # same call repeated; prints the same list again

['abhishek shakya', 'ding dong', 'hello hello', 'somebody in there']
['abhishek shakya', 'ding dong', 'hello hello\nsomebody in there']
['abhishek', 'shakya\nding', 'dong\nhello', 'hello\nsomebody', 'in', 'there']
['abhishek', 'shakya\nding', 'dong\nhello', 'hello\nsomebody', 'in', 'there']


In [16]:
# read a whitespace-separated text file into an array of strings
# genfromtxt requires every line to contain the same number of words (separable entities)
# NOTE(review): np is not imported in any visible cell - presumably `import numpy as np` ran earlier; confirm
with open('random_text.txt','r') as txt:
    text = np.genfromtxt(txt,dtype ='string')    # this is good for reading data in columnar structure like logs
print(text)

[['Abhishek' 'shakya']
 ['pulkit' 'shakya']]


In [30]:
# using the rsplit method (random_string comes from the splitlines cell)
print(random_string.rsplit(' ',2))    # at most 2 splits, counted from the right end

['abhishek shakya\nding dong\nhello hello\nsomebody', 'in', 'there']


In [37]:
# the strip method with an explicit set of characters to remove
sample_string = '\nabhishek\nshakya\n'
print(sample_string)
sample_string.strip('\n')    # returns a new string without the leading / trailing '\n'; the inner '\n' stays and sample_string itself is unchanged



abhishek
shakya



'abhishek\nshakya'

In [75]:
# reading a text file: readlines() returns a list with one string per line, each keeping its '\n'
with open('sample_text.txt','r') as text_object:
    text = text_object.readlines()
print(text)    # the '\xc2\xa0' items in the output are the UTF-8 bytes of a non-breaking space

['Problem Statement\n', 'Welcome to Gardenia - A country which believes in creating a harmony between technology and natural resources. Over the years, Gardenia has come up with ways to utilise natural resources effectively and they have enabled this with use of cutting edge technology.\n', '\xc2\xa0\n', 'The country takes pride in the way it has maintained its natural resources and Gardens. Now, the government of Gardenia wants to use data science to understand the health habits of their citizens. So, they started to capture several variables from various parks in the country.\n', '\xc2\xa0\n', 'They now have this data over a long period of time and are looking for experts like yourself to look at the data and tell, if you can predict how many people will come to a park on a particular day, given the environmental information.\n', '\xc2\xa0\n', 'Go, help the mayor of Gardenia! He has high hopes from Analytics Vidhya community! All the best!\n', '\n']


In [79]:
# counting how often each word occurs (raw term counts, not yet tf-idf) using the strip() method
import string
import sys
# Module-level word -> count table. count_unique_word() fills it and the
# sorting cell further down reads it, so it has to stay a global.
word_dict = {}


def count_unique_word(text):
    """Count how often each word occurs in *text* and print the tally.

    Each line is lower-cased, stripped of leading/trailing newlines and split
    on single spaces. Punctuation is not removed and empty strings are counted
    too, so 'now' and 'now,' end up as separate entries.

    Counts are accumulated into the module-level ``word_dict``; calling the
    function again adds to the counts that are already there.

    Args:
        text: iterable of strings, one per line (e.g. the result of readlines()).

    Returns:
        The module-level ``word_dict`` mapping each word to its count.
    """
    for line in text:
        words = line.lower().strip('\n').split(' ')
        for word in words:
            # dict.get supplies 0 for a word seen for the first time
            word_dict[word] = word_dict.get(word, 0) + 1
    print(list(word_dict))    # printing every key(word) of dictionary
    # sorted(word_dict) gives the keys sorted in ascending order
    for word in sorted(word_dict):
        # format one string so Python 2 does not print a tuple here
        print('{} --> {}'.format(word, word_dict[word]))
    return word_dict


if __name__ == '__main__':
    count_unique_word(text)

['', 'all', 'help', 'over', 'its', 'information.', 'to', 'health', 'has', 'resources', 'resources.', 'government', 'gardens.', 'cutting', 'they', 'now', 'like', 'technology.', 'best!', 'country.', 'habits', 'people', 'analytics', 'are', 'for', 'ways', 'mayor', 'looking', 'day,', 'years,', 'between', 'various', 'creating', 'of', 'given', 'come', 'on', 'country', 'community!', 'variables', 'period', 'yourself', 'maintained', 'now,', 'use', 'from', 'takes', 'long', 'their', 'way', 'wants', 'statement', 'go,', 'started', 'park', 'so,', 'citizens.', 'environmental', 'understand', 'particular', 'hopes', 'with', 'he', 'pride', '\xc2\xa0', 'look', 'this', 'science', 'enabled', 'up', 'believes', 'will', 'can', 'many', 'problem', 'and', 'vidhya', 'predict', 'it', 'high', 'effectively', 'experts', 'at', 'have', 'in', 'technology', 'if', 'capture', 'gardenia!', '-', 'parks', 'how', 'harmony', 'which', 'you', 'several', 'tell,', 'utilise', 'gardenia', 'welcome', 'data', 'a', 'natural', 'edge', 'tim

In [101]:
import operator
# rank the (word, count) pairs by count, most frequent first, and print one pair per line
sorted_word = sorted(
    word_dict.items(),
    key=operator.itemgetter(1),    # sort on the count, i.e. the second item of each pair
    reverse=True,
)
for word, count in sorted_word:
    print('{}: {}'.format(word, count))

the: 10
to: 7
of: 5
and: 5
a: 5
has: 3
they: 3
 : 3
in: 3
gardenia: 3
data: 3
natural: 3
over: 2
resources: 2
come: 2
country: 2
use: 2
from: 2
with: 2
this: 2
have: 2
: 1
all: 1
help: 1
its: 1
information.: 1
health: 1
resources.: 1
government: 1
gardens.: 1
cutting: 1
now: 1
like: 1
technology.: 1
best!: 1
country.: 1
habits: 1
people: 1
analytics: 1
are: 1
for: 1
ways: 1
mayor: 1
looking: 1
day,: 1
years,: 1
between: 1
various: 1
creating: 1
given: 1
on: 1
community!: 1
variables: 1
period: 1
yourself: 1
maintained: 1
now,: 1
takes: 1
long: 1
their: 1
way: 1
wants: 1
statement: 1
go,: 1
started: 1
park: 1
so,: 1
citizens.: 1
environmental: 1
understand: 1
particular: 1
hopes: 1
he: 1
pride: 1
look: 1
science: 1
enabled: 1
up: 1
believes: 1
will: 1
can: 1
many: 1
problem: 1
vidhya: 1
predict: 1
it: 1
high: 1
effectively: 1
experts: 1
at: 1
technology: 1
if: 1
capture: 1
gardenia!: 1
-: 1
parks: 1
how: 1
harmony: 1
which: 1
you: 1
several: 1
tell,: 1
utilise: 1
welcome: 1
edge: 1
time

In [None]:
# earlier we found spaces and some punctuation marks interfering with our counting process
# so we will use some more advanced techniques
