# Basic Data Types and Operators

### Comments, Variables, and `print()`

In [None]:
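# This is a comment as it starts with #
# Note: Python has no dedicated multi-line comment syntax. The triple-quoted
# blocks below are string literals that are evaluated and then discarded,
# which is why they are commonly used as multi-line comments.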

'''
This is a
multiple-line
comment
'''

"""
This is another
multiple-line
comment
"""

# Python is a strongly-typed language, so the type matters when performing operations.
# On the other hand, Python is a dynamically-typed language, so a variable's type
# is determined at runtime by the data it currently holds.

# python has a function called "print()" that prints variables based on their types

# Integer variable
x = 3
print(x)

# Float variable
y = 3.5
print(y)

# print() can print multiple objects, and it takes a parameter called "sep" that is by default a space
print(x, y)

# in the next line, we set the "sep" parameter as a new line "\n"
print(x, y, sep="\n")


3
3.5
3 3.5
3
3.5


### Function `type()`
You can use the `type()` function to determine the type of a variable.

In [None]:
print(type(x), type(y))

<class 'int'> <class 'float'>


### Strings

In [None]:
# strings - both single or double quotes can be used to define strings
z = "String with double quotes"
w = 'String with single quotes'
a = "String with 'single quotes' in it"
b = 'String with "double quotes" in it'
print(z, w, a, b, sep='\n')
print (type(z))

String with double quotes
String with single quotes
String with 'single quotes' in it
String with "double quotes" in it
<class 'str'>


### Casting

In [None]:
x = str(3)
y = int("3")
z = float(3)
print(x, y, z, sep='\n')

3
3
3.0


### Boolean Variables

In [None]:
a = True
b = False
print(a, b, int(a), int(b))
print(type(a))

True False 1 0
<class 'bool'>


### Lists

In [None]:
# Every label is left-justified to the same width so the printed values line up.
LABEL_WIDTH = 57


def show(label, value):
    """Print ``label`` padded to LABEL_WIDTH characters, followed by ``value``."""
    print(f"{label:<{LABEL_WIDTH}}", value)


# a list can contain items of different types
l = [1, "NLP", 2.3]
show('This is the original list:', l)

# add item to list
l.append("4")
show('This is the list after adding "4" to it:', l)

# extend list with another list
l.extend([5, 6])
show('This is the list after extending it with [5, 6]:', l)

# insert item to list at a specific index
l.insert(2, 0)
show('This is the list after inserting "0" at index "2":', l)

# remove item from list by value
l.remove("NLP")
show('This is the list after removing "NLP" from it:', l)

# remove item from list by index; pop() also returns the removed item
d = l.pop(3)
print(f"The list after removing the item of index 3, which is {d}: {l}")

# sort list (possible here only because the remaining items are all numbers)
l.sort()
show('This is the list after sorting it:', l)

# reverse list
l.reverse()
show('This is the list after reversing it:', l)

# get count of items in a list
show('number of items in the list:', len(l))

# access items in a list (zero-indexed)
show('first item in the list:', l[0])

# access the last item in a list
show('last item in the list:', l[-1])

# access a slice from the list (the end index is exclusive)
show('from item of index "1" to item of index "2":', l[1:3])
show('from the first item to item of index "1":', l[:2])
show('from item of index "1" to the last item:', l[1:])

# set item in a list
l[1] = 2

# "NLP" was removed earlier, so the label only states the new value
show('the list after changing the second item to 2:', l)

# remove item from list by index with del
del l[0]

show('the list after removing the first item:', l)


This is the original list:                                [1, 'NLP', 2.3]
This is the list after adding "4" to it:                  [1, 'NLP', 2.3, '4']
This is the list after extending it with [5, 6]:          [1, 'NLP', 2.3, '4', 5, 6]
This is the list after inserting "0" at index "2":        [1, 'NLP', 0, 2.3, '4', 5, 6]
This is the list after removing "NLP" from it:              [1, 0, 2.3, '4', 5, 6]
The list after removing the item of index 3, which is 4: [1, 0, 2.3, 5, 6]
This is the list after sorting it:                        [0, 1, 2.3, 5, 6]
This is the list after reversing it:                      [6, 5, 2.3, 1, 0]
number of items in the list:                              5
first item in the list:                                   6
last item in the list:                                    0
from item of index "1" to item of index "2":              [5, 2.3]
from the first item to item of index "1":                 [6, 5]
from item of index "1" to the last item:            

### Dictionaries
Dictionaries store data as (`key`, `value`) pairs of arbitrary types.

In [None]:
d = {
        "1": 1,
        "2": 2,
        3: "3",
        4.5:"4.5"
    }
print('item with a string key:              ', d["1"])
print('item with an int key:                ', d[3])
print('item with a float key:               ', d[4.5])

# get value of a specific key
print('value of key 4.5:                    ', d.get(4.5))

# add item to the dictionary
d[5] = "5"
print('dictionary after adding an item:     ', d)

# update item in the dictionary
d[5] = "five"
print('dictionary after updating an item:   ', d)

# print all keys in the dictionary
print('keys in the dictionary:              ', d.keys())

# print all values in the dictionary
print('values in the dictionary:            ', d.values())

# print all items in the dictionary
print('items in the dictionary:             ', d.items())

# pop item from the dictionary
item=d.pop('1')
print(f"dictionary after popping the item with key '1', which is {item}: {d}")


# delete item with a specific key
del d["2"]
print('dictionary after deleting an item:   ', d)


item with a string key:               1
item with an int key:                 3
item with a float key:                4.5
value of key 4.5:                     4.5
dictionary after adding an item:      {'1': 1, '2': 2, 3: '3', 4.5: '4.5', 5: '5'}
dictionary after updating an item:    {'1': 1, '2': 2, 3: '3', 4.5: '4.5', 5: 'five'}
keys in the dictionary:               dict_keys(['1', '2', 3, 4.5, 5])
values in the dictionary:             dict_values([1, 2, '3', '4.5', 'five'])
items in the dictionary:              dict_items([('1', 1), ('2', 2), (3, '3'), (4.5, '4.5'), (5, 'five')])
dictionary after popping the item with key '1', which is 1: {'2': 2, 3: '3', 4.5: '4.5', 5: 'five'}
dictionary after deleting an item:    {3: '3', 4.5: '4.5', 5: 'five'}


### Tuples
Tuples are `immutable` objects, lists are `mutable`.

Tuples cannot be changed while lists can.

In [None]:
t = (1, 2.5, "3")
print('original tuple:  ', t)
print('first item       ', t[0])
print()

"""
Tuple is immutable, although you can use the + operator to concatenate several tuples.
 The old object is still present at this point, and a new object is created.
"""
t = t + (4,)
print('after adding 4:  ', t)

# tuples can be a key of a dict
d = {t: "tuple"}
print(d[t])
print(d)

original tuple:   (1, 2.5, '3')
first item        1

after adding 4:   (1, 2.5, '3', 4)
tuple
{(1, 2.5, '3', 4): 'tuple'}


In [None]:
# Tuples are immutable, so item assignment raises a TypeError.
# The error is caught and printed so that "Run All" can continue past this cell.
try:
    t[0] = 2
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'tuple' object does not support item assignment

### Sets
- `Unchangeable`: items cannot be modified in place (as in tuples), but items can be added to or removed from the set
- `unindexed`: we cannot access a specific index
- `unique values only`: no duplicate values

In [None]:
s = {"a", "b", "c"}
print(s)

# add item to the set
s.add(5)
# remove item from the set
s.remove("a")
print(s)

# try adding duplicate items
s = {5, "b", "c", "b"}
print(s)

{'c', 'a', 'b'}
{'c', 5, 'b'}
{'c', 5, 'b'}


In [None]:
# Sets are unindexed, so subscripting raises a TypeError.
# The error is caught and printed so that "Run All" can continue past this cell.
try:
    s[0]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'set' object is not subscriptable

### Conversions

In [None]:
# convert a string to a list of characters
l=list("abc def")

print(l)

['a', 'b', 'c', ' ', 'd', 'e', 'f']


In [None]:
s= set(["one","two","one"])
print(s)

{'two', 'one'}


In [None]:
w="words with spaces".split() # split the string into a list of words
print(w)

# split the string on a specific character (,)
n="1,4,8,2".split(",")
print(n)

['words', 'with', 'spaces']
['1', '4', '8', '2']


In [None]:
# join the list of words into a string
print(' '.join(w) )

# join the list of words into a string with a comma between them
print(','.join(n))

words with spaces
1,4,8,2


### Arithmetic Operators

In [None]:
x = 2
y = 3
print(x + y)
print(x - y)
print(y / x)
print(y // x)
print(x * y)
print(y % x)
print(x**y)

5
-1
1.5
1
6
1
8


### Assignment operators

In [None]:
x = 2
print(x)
x += 1
print(x)
x -= 1
print(x)
x /= 2
print(x)
x *= 3
print(x)
x //= 2
print(x)

2
3
2
1.0
3.0
1.0


### Comparison operators

In [None]:
x = 3
y = 2
print(x == y)
print(x != y)
print(x >= y)
print(x <= y)
print(x > y)
print(x < y)

False
True
True
False
True
False


### Logical Operators

In [None]:
x = True
y = False
print(x and y)
print(x or y)
print(not y)

False
True
True


# Flow Control
`Blocks` in `Python` are structured using `indentation`

### `if`-`elif`-`else`

In [None]:
x = 2
y = 3
if x > y:
    # here indentation is important
    print ("x > y")
elif x < y:
    print("x < y")
else:
    print("x = y")

x < y


### `while` loops

In [None]:
# while loops
i = 0
while i < 10:
    print(i)
    i += 1

0
1
2
3
4
5
6
7
8
9


### `for` loops
`for` is used to iterate over iterables like lists, dictionaries, sets, strings, tuples, ...


In [None]:
l = ["1", "2", 3, 4, 5.3]
for item in l:
    print(item)

print()

# range function
print(list(range(2, 10)))
print(list(range(10)))
print(list(range(2, 10, 2)))
print()

for i in range(2, 10):
    print(i)

print()

s = "NLP"
for c in s:
    print(c)

1
2
3
4
5.3

[2, 3, 4, 5, 6, 7, 8, 9]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[2, 4, 6, 8]

2
3
4
5
6
7
8
9

N
L
P


# Functions

In [None]:
# function with no arguments
def func():
    """Print a fixed message; takes no arguments and returns None."""
    message = "in func"
    print(message)

In [None]:
func()

in func


In [None]:
# function with arguments
def func(a):
    """Print the string form of the single argument ``a``."""
    text = str(a)
    print(text)

In [None]:
func(3)
func([1, 2])

3
[1, 2]


In [None]:
# function that performs some logic and returns a value
def func(a):
    """Return ``a`` incremented by one."""
    incremented = a + 1
    return incremented

In [None]:
f = func(3)
print(f)

4


In [None]:
# function that takes a variable number of arguments
def func(*args):
    """Print every positional argument on its own line; print nothing when called without arguments."""
    # Guard the empty case: print() with no objects would still emit a newline.
    if args:
        print(*args, sep="\n")

In [None]:
func(1, 2, "NLP")

1
2
NLP


In [None]:
# function with 3 arguments
def func(a, b, c):
    """Print the three arguments on one line, separated by spaces."""
    values = (a, b, c)
    print(*values)

In [None]:
func(1, 2, 3)
func(b=2, a=1, c=3)

1 2 3
1 2 3


In [None]:
# a function with an argument that has a default value
def func(a, b=1):
    """Print ``a`` and ``b`` separated by a space; ``b`` defaults to 1."""
    pair = (a, b)
    print(*pair)

In [None]:
func(2,5)
func(2)

2 5
2 1


In [None]:
# A parameter without a default value cannot follow a parameter with a default.
# The invalid definition is kept as source text and compiled explicitly, so the
# SyntaxError can be shown without making the whole cell (and "Run All") fail.
invalid_definition = "def func(b=9, a):\n    print(a, b)\n"
try:
    compile(invalid_definition, "<invalid-def>", "exec")
except SyntaxError as err:
    print(f"SyntaxError: {err.msg}")

SyntaxError: parameter without a default follows parameter with a default (25820532.py, line 2)

In [None]:
# a function that accepts arbitrary keyword arguments
def func(**kwargs):
    """Print the value passed under the ``subject`` keyword.

    Raises KeyError when no ``subject`` keyword argument is supplied.
    """
    subject = kwargs["subject"]
    print(subject)

In [None]:
func(subject="NLP", section=1)

NLP


# Classes

In [None]:
class Human:
    """A person identified by a name that can be changed later."""

    def __init__(self, name):
        """Announce the creation and store ``name`` on the instance."""
        print("Human created")
        self.name = name

    def changeName(self, name):
        """Announce the rename, then replace the stored name with ``name``."""
        print(f"replacing {self.name} with {name}")
        self.name = name

    def __str__(self):
        """Return the introduction sentence used by ``print()`` and ``str()``."""
        return f"My name is {self.name}"

In [None]:
h = Human("omar")
print(h)
h.changeName("mohamed")
print(h)

Human created
My name is omar
replacing omar with mohamed
My name is mohamed


In [None]:
class Student(Human):
    """A Human who is additionally enrolled in an academic year."""

    def __init__(self, name, year):
        """Initialise the Human part with ``name``, then store ``year``."""
        # super() finds the parent class through the MRO instead of hard-coding
        # Human, so the call stays correct if the base class is ever changed.
        super().__init__(name)
        self.year = year
        print("Student created")

    def __str__(self):
        """Return a two-line description: the name, then the current year."""
        return f"Student name: {self.name}\nIn year: {self.year}"

    def passed(self):
        """Advance the student to the next year by incrementing ``year``."""
        self.year += 1

In [None]:
s = Student("omar", 4)
print(s)
s.passed()
print(s)

Human created
Student created
Student name: omar
In year: 4
Student name: omar
In year: 5


# Numpy
Numpy is the core library for scientific computing in Python. It provides a high-performance multidimensional array object, and tools for working with these arrays.

In [None]:
import numpy as np

## Arrays

A numpy array is a grid of values, all of the same type, and is indexed by a tuple of nonnegative integers. The number of dimensions is the rank of the array; the shape of an array is a tuple of integers giving the size of the array along each dimension.

We can initialize numpy arrays from nested Python lists, and access elements using square brackets:

In [None]:
a = np.array([1, 2, 3])  # Create a rank 1 array
print(type(a), a.shape, a[0], a[1], a[2])
a[0] = 5                 # Change an element of the array
print(a)

<class 'numpy.ndarray'> (3,) 1 2 3
[5 2 3]


In [None]:
b = np.array([[1,2,3],[4,5,6]])   # Create a rank 2 array
print(b)

[[1 2 3]
 [4 5 6]]


In [None]:
print(b.shape)
print(b[0, 0], b[0, 1], b[1, 0])

(2, 3)
1 2 4


Numpy also provides many functions to create arrays:

In [None]:
a = np.zeros((2,2))  # Create an array of all zeros
print(a)

[[0. 0.]
 [0. 0.]]


In [None]:
b = np.ones((1,2))   # Create an array of all ones
print(b)

[[1. 1.]]


In [None]:
c = np.full((2,2), 7) # Create a constant array
print(c)

[[7 7]
 [7 7]]


In [None]:
d = np.eye(2)        # Create a 2x2 identity matrix
print(d)

[[1. 0.]
 [0. 1.]]


In [None]:
e = np.random.random((2,2)) # Create an array filled with random values
print(e)

[[0.33703282 0.18304777]
 [0.99886052 0.44683842]]


## Array Indexing

Slicing: Similar to Python lists, numpy arrays can be sliced. Since arrays may be multidimensional, you must specify a slice for each dimension of the array:

In [None]:
import numpy as np

# Create the following rank 2 array with shape (3, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])

# Use slicing to pull out the subarray consisting of the first 2 rows
# and columns 1 and 2; b is the following array of shape (2, 2):
# [[2 3]
#  [6 7]]
b = a[:2, 1:3]
print(b)

[[2 3]
 [6 7]]


A slice of an array is a view into the same data, so modifying it will modify the original array.

In [None]:
print(a[0, 1])
b[0, 0] = 77    # b[0, 0] is the same piece of data as a[0, 1]
print(a[0, 1])

2
77


In [None]:
# Create the following rank 2 array with shape (3, 4)
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
print(a)

row_r1 = a[1, :]    # Rank 1 view of the second row of a
row_r2 = a[1:2, :]  # Rank 2 view of the second row of a
row_r3 = a[[1], :]  # Rank 2 view of the second row of a
print(row_r1, row_r1.shape)
print(row_r2, row_r2.shape)
print(row_r3, row_r3.shape)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
[5 6 7 8] (4,)
[[5 6 7 8]] (1, 4)
[[5 6 7 8]] (1, 4)


In [None]:
# We can make the same distinction when accessing columns of an array:
col_r1 = a[:, 1]
col_r2 = a[:, 1:2]
print(col_r1, col_r1.shape)
print()
print(col_r2, col_r2.shape)

[ 2  6 10] (3,)

[[ 2]
 [ 6]
 [10]] (3, 1)


In [None]:
a = np.array([[1,2], [3, 4], [5, 6]])

# An example of integer array indexing.
# The returned array will have shape (3,) and contains a[0, 0], a[1, 1] and a[2, 0]
print(a[[0, 1, 2], [0, 1, 0]])

# The above example of integer array indexing is equivalent to this:
print(np.array([a[0, 0], a[1, 1], a[2, 0]]))

[1 4 5]
[1 4 5]


In [None]:
# When using integer array indexing, you can reuse the same
# element from the source array:
print(a[[0, 0], [1, 1]])

# Equivalent to the previous integer array indexing example
print(np.array([a[0, 1], a[0, 1]]))

[2 2]
[2 2]


Boolean array indexing: Boolean array indexing lets you pick out arbitrary elements of an array. Frequently this type of indexing is used to select the elements of an array that satisfy some condition. Here is an example:

In [None]:
import numpy as np

a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a > 2)  # Find the elements of a that are bigger than 2;
                    # this returns a numpy array of Booleans of the same
                    # shape as a, where each slot of bool_idx tells
                    # whether that element of a is > 2.

print(bool_idx)

[[False False]
 [ True  True]
 [ True  True]]


In [None]:
# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx])

# We can do all of the above in a single concise statement:
print(a[a > 2])

[3 4 5 6]
[3 4 5 6]


## Datatypes

In [None]:
x = np.array([1, 2])  # Let numpy choose the datatype
y = np.array([1.0, 2.0])  # Let numpy choose the datatype
z = np.array([1, 2], dtype=np.int64)  # Force a particular datatype

print(x.dtype, y.dtype, z.dtype)

int64 float64 int64


## Array Math

In [None]:
x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
print(x + y)
print(np.add(x, y))

[[ 6.  8.]
 [10. 12.]]
[[ 6.  8.]
 [10. 12.]]


In [None]:
# Elementwise difference; both produce the array
print(x - y)
print(np.subtract(x, y))

[[-4. -4.]
 [-4. -4.]]
[[-4. -4.]
 [-4. -4.]]


In [None]:
# Elementwise product; both produce the array
print(x * y)
print(np.multiply(x, y))

[[ 5. 12.]
 [21. 32.]]
[[ 5. 12.]
 [21. 32.]]


In [None]:
# Elementwise division; both produce the array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]
print(x / y)
print(np.divide(x, y))

[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]


In [None]:
# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
print(np.sqrt(x))

[[1.         1.41421356]
 [1.73205081 2.        ]]


In [None]:
x = np.array([[1,2],[3,4]])
y = np.array([[5,6],[7,8]])

v = np.array([9,10])
w = np.array([11, 12])

# Inner product of vectors; both produce 219
print(v.dot(w))
print(np.dot(v, w))

219
219


In [None]:
# Equivalent to np.dot for these 1-D vectors
print(v @ w)

219


In [None]:
x = np.array([[1,2],[3,4]])

print(np.sum(x))  # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0))  # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1))  # Compute sum of each row; prints "[3 7]"

10
[4 6]
[3 7]


In [None]:
print(x)
print("transpose\n", x.T)

[[1 2]
 [3 4]]
transpose
 [[1 3]
 [2 4]]


In [None]:
v = np.array([[1,2,3]])
print(v)
print("transpose\n", v.T)

[[1 2 3]]
transpose
 [[1]
 [2]
 [3]]


## Broadcasting

Broadcasting is a powerful mechanism that allows numpy to work with arrays of different shapes when performing arithmetic operations. Frequently we have a smaller array and a larger array, and we want to use the smaller array multiple times to perform some operation on the larger array.

In [None]:
# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = np.empty_like(x)   # Uninitialised array with the same shape and dtype as x

# Add the vector v to each row of the matrix x with an explicit loop.
# The row count comes from x.shape[0] rather than a hard-coded 4, so the loop
# keeps working if rows are added to or removed from x.
for i in range(x.shape[0]):
    y[i, :] = x[i, :] + v

print(y)

[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


This works; however when the matrix x is very large, computing an explicit loop in Python could be slow. Note that adding the vector v to each row of the matrix x is equivalent to forming a matrix vv by stacking multiple copies of v vertically, then performing elementwise summation of x and vv. We could implement this approach like this:

In [None]:
vv = np.tile(v, (4, 1))  # Stack 4 copies of v on top of each other
print(vv)                # Prints "[[1 0 1]
                         #          [1 0 1]
                         #          [1 0 1]
                         #          [1 0 1]]"

[[1 0 1]
 [1 0 1]
 [1 0 1]
 [1 0 1]]


In [None]:
y = x + vv  # Add x and vv elementwise
print(y)

[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


Numpy broadcasting allows us to perform this computation without actually creating multiple copies of v. Consider this version, using broadcasting:

In [None]:
import numpy as np

# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = x + v  # Add v to each row of x using broadcasting
print(y)

[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


# Towards Text Processing

In [None]:
# Natural Language Toolkit
# !pip install nltk

In [None]:
import nltk
nltk.download('book')

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_tree

True

In [None]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [None]:
type(text1)

nltk.text.Text

In [None]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [None]:
# search for a specific word in a text
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [None]:
# the number of tokens in a given text (words and punctuation symbols)
len(text3)

44764

In [None]:
# to obtain the vocabulary of a given corpus (the unique words and punctuation symbols)
sorted(set(text3))

['!',
 "'",
 '(',
 ')',
 ',',
 ',)',
 '.',
 '.)',
 ':',
 ';',
 ';)',
 '?',
 '?)',
 'A',
 'Abel',
 'Abelmizraim',
 'Abidah',
 'Abide',
 'Abimael',
 'Abimelech',
 'Abr',
 'Abrah',
 'Abraham',
 'Abram',
 'Accad',
 'Achbor',
 'Adah',
 'Adam',
 'Adbeel',
 'Admah',
 'Adullamite',
 'After',
 'Aholibamah',
 'Ahuzzath',
 'Ajah',
 'Akan',
 'All',
 'Allonbachuth',
 'Almighty',
 'Almodad',
 'Also',
 'Alvah',
 'Alvan',
 'Am',
 'Amal',
 'Amalek',
 'Amalekites',
 'Ammon',
 'Amorite',
 'Amorites',
 'Amraphel',
 'An',
 'Anah',
 'Anamim',
 'And',
 'Aner',
 'Angel',
 'Appoint',
 'Aram',
 'Aran',
 'Ararat',
 'Arbah',
 'Ard',
 'Are',
 'Areli',
 'Arioch',
 'Arise',
 'Arkite',
 'Arodi',
 'Arphaxad',
 'Art',
 'Arvadite',
 'As',
 'Asenath',
 'Ashbel',
 'Asher',
 'Ashkenaz',
 'Ashteroth',
 'Ask',
 'Asshur',
 'Asshurim',
 'Assyr',
 'Assyria',
 'At',
 'Atad',
 'Avith',
 'Baalhanan',
 'Babel',
 'Bashemath',
 'Be',
 'Because',
 'Becher',
 'Bedad',
 'Beeri',
 'Beerlahairoi',
 'Beersheba',
 'Behold',
 'Bela',
 'Belah

In [None]:
print(len(set(text3)))
print(len(text3))
print(len(set(text3)) / len(text3) * 100)
# What does comparing the number of tokens with the vocabulary size reveal?

2789
44764
6.230453042623537


In [None]:
# to compute the number of occurrences of a specific word
text3.count("I")

484

# Let's think about Text

A text is a sequence of words and other symbols (tokens) separated by white space, new lines, ...
We can represent any corpus as a sequence of tokens, which in Python is simply a list. This is how NLTK represents text corpora.

In [None]:
sent1 = ['Call', 'me', 'Ishmael', '.']

# to see the total number of tokens
print(len(sent1))

4


In [None]:
sent2

['The',
 'family',
 'of',
 'Dashwood',
 'had',
 'long',
 'been',
 'settled',
 'in',
 'Sussex',
 '.']

In [None]:
sent3

['In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [None]:
# to concatenate two sentences (lists)
sent2 + sent3

['The',
 'family',
 'of',
 'Dashwood',
 'had',
 'long',
 'been',
 'settled',
 'in',
 'Sussex',
 '.',
 'In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [None]:
# to add new token to a sentence
sent1.append('Some')
sent1

['Call', 'me', 'Ishmael', '.', 'Some']

In [None]:
# to access a token in a text (list) by its index
text4[173]

'awaken'

In [None]:
# to get the first index of a specific word
text4.index('awaken')

173

In [None]:
# You can use list slicing to get a part of the text
text5[16715:16735]

['U86',
 'thats',
 'why',
 'something',
 'like',
 'gamefly',
 'is',
 'so',
 'good',
 'because',
 'you',
 'can',
 'actually',
 'play',
 'a',
 'full',
 'game',
 'without',
 'buying',
 'it']

In [None]:
# To compute the frequency distribution (the count of each token) of a corpus
fdist1 = FreqDist(text1)
print(type(fdist1))
fdist1.most_common(10)

<class 'nltk.probability.FreqDist'>


[(',', 18713),
 ('the', 13721),
 ('.', 6862),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 (';', 4072),
 ('in', 3916),
 ('that', 2982)]

In [None]:
fdist1['whale']

906

In [None]:
# To filter words based on the word length
# let's get the words having more than 15 characters
long_words = [w for w in set(text1) if len(w) > 15]
long_words

['irresistibleness',
 'undiscriminating',
 'responsibilities',
 'simultaneousness',
 'uncomfortableness',
 'cannibalistically',
 'preternaturalness',
 'indispensableness',
 'apprehensiveness',
 'circumnavigation',
 'uninterpenetratingly',
 'physiognomically',
 'comprehensiveness',
 'circumnavigating',
 'subterraneousness',
 'indiscriminately',
 'circumnavigations',
 'supernaturalness',
 'Physiognomically',
 'hermaphroditical',
 'CIRCUMNAVIGATION',
 'uncompromisedness',
 'characteristically',
 'superstitiousness']

In [None]:
# what about filtering based on the word frequency in the corpus?
fdist5 = FreqDist(text5)
common_words = [w for w in set(text5) if fdist5[w] > 500]
common_words

['.', 'i', 'you', 'hi', 'I', ',', '?', 'JOIN', 'PART', 'lol', 'a', 'to', 'the']

In [None]:
# python built-in string comparison operations

# checks if string starts with sub-string
print("omar".startswith('o'))
# checks if string ends with sub-string
print("omar".endswith('r'))
# checks if string is sub-string of another
print("ma" in "omar")
# checks if all characters in the string are lowercase
print("omar".islower())
# checks if all characters in the string are uppercase
print("OMAR".isupper())
# checks if all the characters in a string are alphabetic characters (a-z, A-Z) only
print("omar".isalpha())
# checks if all the characters are alphanumeric (a-z, A-Z, 0-9) only
print("omar1".isalnum())
# checks if all the characters are numeric (0-9) only
print("123".isdigit())
# checks if the string is title-cased. (all words in a string begin with uppercase letters and the remaining characters are lowercase letters)
print("Introduction To Python".istitle())

True
True
True
True
True
True
True
True
True


# Towards Text Processing

In [None]:
# Natural Language Toolkit
# !pip install nltk

In [None]:
import nltk
nltk.download('book')

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2000.zip.
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /home/omarsgalal/nltk_data...
[nltk_data]    |   Unzipping corpora/conll2002.zip.
[nltk_data]    | Downloading package dependency_tree

True

In [None]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [None]:
type(text1)

nltk.text.Text

In [None]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [None]:
# search for a specific word in a text
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [None]:
# the number of tokens in a given text (words and punctuation symbols)
len(text3)

44764

In [None]:
# to obtain the vocabulary of a given corpus (the unique words and punctuation symbols)
sorted(set(text3))

['!',
 "'",
 '(',
 ')',
 ',',
 ',)',
 '.',
 '.)',
 ':',
 ';',
 ';)',
 '?',
 '?)',
 'A',
 'Abel',
 'Abelmizraim',
 'Abidah',
 'Abide',
 'Abimael',
 'Abimelech',
 'Abr',
 'Abrah',
 'Abraham',
 'Abram',
 'Accad',
 'Achbor',
 'Adah',
 'Adam',
 'Adbeel',
 'Admah',
 'Adullamite',
 'After',
 'Aholibamah',
 'Ahuzzath',
 'Ajah',
 'Akan',
 'All',
 'Allonbachuth',
 'Almighty',
 'Almodad',
 'Also',
 'Alvah',
 'Alvan',
 'Am',
 'Amal',
 'Amalek',
 'Amalekites',
 'Ammon',
 'Amorite',
 'Amorites',
 'Amraphel',
 'An',
 'Anah',
 'Anamim',
 'And',
 'Aner',
 'Angel',
 'Appoint',
 'Aram',
 'Aran',
 'Ararat',
 'Arbah',
 'Ard',
 'Are',
 'Areli',
 'Arioch',
 'Arise',
 'Arkite',
 'Arodi',
 'Arphaxad',
 'Art',
 'Arvadite',
 'As',
 'Asenath',
 'Ashbel',
 'Asher',
 'Ashkenaz',
 'Ashteroth',
 'Ask',
 'Asshur',
 'Asshurim',
 'Assyr',
 'Assyria',
 'At',
 'Atad',
 'Avith',
 'Baalhanan',
 'Babel',
 'Bashemath',
 'Be',
 'Because',
 'Becher',
 'Bedad',
 'Beeri',
 'Beerlahairoi',
 'Beersheba',
 'Behold',
 'Bela',
 'Belah

In [None]:
print(len(set(text3)))
print(len(text3))
print(len(set(text3)) / len(text3) * 100)
# What does comparing the number of tokens with the vocabulary size reveal?

2789
44764
6.230453042623537


In [None]:
# to compute the number of occurrences of a specific word
text3.count("I")

484

# Let's think about Text

A text is a sequence of words and other symbols (tokens) separated by white space, new lines, ...
We can represent any corpus as a sequence of tokens, which in Python is simply a list. This is how NLTK represents text corpora.

In [None]:
sent1 = ['Call', 'me', 'Ishmael', '.']

# to see the total number of tokens
print(len(sent1))

4


In [None]:
sent2

['The',
 'family',
 'of',
 'Dashwood',
 'had',
 'long',
 'been',
 'settled',
 'in',
 'Sussex',
 '.']

In [None]:
sent3

['In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [None]:
# to concatenate two sentences (lists)
sent2 + sent3

['The',
 'family',
 'of',
 'Dashwood',
 'had',
 'long',
 'been',
 'settled',
 'in',
 'Sussex',
 '.',
 'In',
 'the',
 'beginning',
 'God',
 'created',
 'the',
 'heaven',
 'and',
 'the',
 'earth',
 '.']

In [None]:
# to add new token to a sentence
sent1.append('Some')
sent1

['Call', 'me', 'Ishmael', '.', 'Some']

In [None]:
# to access a token in a text (list) by its index
text4[173]

'awaken'

In [None]:
# to get the first index of a specific word
text4.index('awaken')

173

In [None]:
# You can use list slicing to get a part of the text
text5[16715:16735]

['U86',
 'thats',
 'why',
 'something',
 'like',
 'gamefly',
 'is',
 'so',
 'good',
 'because',
 'you',
 'can',
 'actually',
 'play',
 'a',
 'full',
 'game',
 'without',
 'buying',
 'it']

In [None]:
# To compute the frequency distribution (the count of each token) of a corpus
fdist1 = FreqDist(text1)
print(type(fdist1))
fdist1.most_common(10)

<class 'nltk.probability.FreqDist'>


[(',', 18713),
 ('the', 13721),
 ('.', 6862),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 (';', 4072),
 ('in', 3916),
 ('that', 2982)]

In [None]:
fdist1['whale']

906

In [None]:
# To filter words based on the word length
# let's get the words having more than 15 characters
long_words = [w for w in set(text1) if len(w) > 15]
long_words

['irresistibleness',
 'undiscriminating',
 'responsibilities',
 'simultaneousness',
 'uncomfortableness',
 'cannibalistically',
 'preternaturalness',
 'indispensableness',
 'apprehensiveness',
 'circumnavigation',
 'uninterpenetratingly',
 'physiognomically',
 'comprehensiveness',
 'circumnavigating',
 'subterraneousness',
 'indiscriminately',
 'circumnavigations',
 'supernaturalness',
 'Physiognomically',
 'hermaphroditical',
 'CIRCUMNAVIGATION',
 'uncompromisedness',
 'characteristically',
 'superstitiousness']

In [None]:
# what about filtering based on the word frequency in the corpus?
fdist5 = FreqDist(text5)
common_words = [w for w in set(text5) if fdist5[w] > 500]
common_words

['.', 'i', 'you', 'hi', 'I', ',', '?', 'JOIN', 'PART', 'lol', 'a', 'to', 'the']

In [None]:
# python built-in string comparison operations

# checks if string starts with sub-string
print("omar".startswith('o'))
# checks if string ends with sub-string
print("omar".endswith('r'))
# checks if string is sub-string of another
print("ma" in "omar")
# checks if all characters in the string are lowercase
print("omar".islower())
# checks if all characters in the string are uppercase
print("OMAR".isupper())
# checks if all the characters in a string are alphabetic characters (a-z, A-Z) only
print("omar".isalpha())
# checks if all the characters are alphanumeric (a-z, A-Z, 0-9) only
print("omar1".isalnum())
# checks if all the characters are numeric (0-9) only
print("123".isdigit())
# checks if the string is title-cased. (all words in a string begin with uppercase letters and the remaining characters are lowercase letters)
print("Introduction To Python".istitle())

True
True
True
True
True
True
True
True
True
