# Intro to data structures

In [3]:
import numpy as np
import pandas as pd

## Series

In [5]:
# Build a Series from a NumPy ndarray, giving every value an explicit label.
# A seeded Generator replaces the global np.random state so the five sample
# values are the same on every Restart & Run All.
rng = np.random.default_rng(42)
s = pd.Series(rng.standard_normal(5), index=["a", "b", "c", "d", "e"])
print(s)
print(s.index)  # the labels live in an Index object

a    0.654457
b    1.800125
c   -0.329065
d    1.619839
e   -0.734587
dtype: float64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


In [None]:
# Build a Series from a dict: the keys become labels, the values become data.
d = dict(b=1, a=0, c=5)
s = pd.Series(d)
print(s)
# When an index is passed as well, the values are looked up by label and
# returned in the order of that index.
s = pd.Series(d, index=list("adcb"))
# "d" is not a key of the dict, so its slot holds NaN ("not a number"),
# the standard missing-data marker in pandas.
print(s)

b    1
a    0
c    5
dtype: int64
a    0.0
d    NaN
c    5.0
b    1.0
dtype: float64


In [10]:
# Build a Series from a scalar: the value is repeated once per index label.
s = pd.Series(5.0, index=list("abcd"))
print(s)

a    5.0
b    5.0
c    5.0
d    5.0
dtype: float64


## Series is ndarray-like

In [25]:
# A Series supports ndarray-style slicing, boolean masking and fancy indexing.
s = pd.Series(np.arange(4), index=list("abcd"))
print(s)
# Plain s[2] did not work here because the index holds string labels;
# positional access goes through .iloc instead.
print(s.iloc[:3])               # first three rows, by position
above_median = s > s.median()   # boolean mask sharing s's index
print(s[above_median])
print(s.iloc[[3, 1, 2]])        # arbitrary positions, in the requested order

a    0
b    1
c    2
d    3
dtype: int64
a    0
b    1
c    2
dtype: int64
c    2
d    3
dtype: int64
d    3
b    1
c    2
dtype: int64


In [26]:
# As with a NumPy array, all elements of a Series share a single dtype.
print(s.dtype)
# .to_numpy() hands the values back as an actual ndarray when one is needed.
c = s.to_numpy()
print(c, type(c), sep="\n")

int64
[0 1 2 3]
<class 'numpy.ndarray'>


## Series is dict-like

In [35]:
# A Series also behaves like a fixed-size dict keyed by its index labels.
s = pd.Series(np.arange(4), index=list("abcd"))
print(s["a"])   # read a value by label
s["d"] = -9     # overwrite a value by label
print(s)
# Membership tests look at the labels, not at the values.
for label in ("e", "c"):
    print(label in s)
# s["x"] would raise an exception because "x" is not a label;
# .get() is the safe lookup.
print(s.get("x"))           # missing label, no default -> None
print(s.get("x", np.nan))   # missing label, explicit default -> nan
# Labels that are valid identifiers can also be read as attributes.
print(s.a)

0
a    0
b    1
c    2
d   -9
dtype: int64
False
True
None
nan
0


## Vectorized operations and label alignment with Series

In [38]:
# Arithmetic on a Series is vectorized: no explicit loop over the values.
s = pd.Series(np.arange(4), index=list("abcd"))
doubled = s + s
scaled = s * 20
print(doubled)
print(scaled)
# Most NumPy functions that expect an ndarray accept a Series as well.
print(np.exp(s))

a    0
b    2
c    4
d    6
dtype: int64
a     0
b    20
c    40
d    60
dtype: int64
a     1.000000
b     2.718282
c     7.389056
d    20.085537
dtype: float64


In [51]:
# Operations between two Series align on labels before computing.
s = pd.Series(np.arange(4), index=list("abcd"))
d = pd.Series(np.arange(12, 16), index=list("abcf"))
print(s)
print(d)
# "d" exists only in s and "f" only in d, so neither label has a partner:
# the result carries the union of the labels, with NaN in those two slots.
x = s * d
print(x)
print(s + d)
# dropna() discards the unmatched labels when the NaN rows are not wanted.
print(x.dropna())


a    0
b    1
c    2
d    3
dtype: int64
a    12
b    13
c    14
f    15
dtype: int64
a     0.0
b    13.0
c    28.0
d     NaN
f     NaN
dtype: float64
a    12.0
b    14.0
c    16.0
d     NaN
f     NaN
dtype: float64
a     0.0
b    13.0
c    28.0
dtype: float64


## Name attribute

In [55]:
# A Series can carry a name, set here at construction time.
s = pd.Series(np.arange(5), name="something")
print(s)
print(s.name)
# rename() returns a different object; it could just as well be bound to a
# new variable (e.g. s2) instead of replacing s.
s = s.rename("changed")
print(s.name)

0    0
1    1
2    2
3    3
4    4
Name: something, dtype: int64
something
changed


# Data Frame

In [67]:
# Build a DataFrame from a dict of Series: the keys become column names and
# the row index is the union of the individual Series indexes.
d = {
    "one": pd.Series([1.0, 2.0, 3.0], index=list("abc")),
    "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd")),
}
df = pd.DataFrame(d)
print(df)
# An explicit index selects and reorders rows; the unknown label "x" yields
# a row of NaN.
df2 = pd.DataFrame(d, index=list("cdax"))
print(df2)
# columns= does the same for columns: "three" and "four" are not keys of the
# dict, so those columns come out entirely NaN.
df3 = pd.DataFrame(
    d,
    index=list("dabcx"),
    columns=["two", "three", "one", "four"],
)
print(df3)
print("-----------------------")
print(df3.index)
print(df3.columns)

   one  two
a  1.0  1.0
b  2.0  2.0
c  3.0  3.0
d  NaN  4.0
   one  two
c  3.0  3.0
d  NaN  4.0
a  1.0  1.0
x  NaN  NaN
   two three  one four
d  4.0   NaN  NaN  NaN
a  1.0   NaN  1.0  NaN
b  2.0   NaN  2.0  NaN
c  3.0   NaN  3.0  NaN
x  NaN   NaN  NaN  NaN
-----------------------
Index(['d', 'a', 'b', 'c', 'x'], dtype='object')
Index(['two', 'three', 'one', 'four'], dtype='object')


In [73]:
# Build a DataFrame from a dict of equal-length lists (ndarrays work too).
d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]}
# With no index given, the rows are labelled range(n), n being the list length.
d2 = pd.DataFrame(d)
print(d2)
# Here the four rows are labelled "a".."d" instead.
d3 = pd.DataFrame(d, index=list("abcd"))
print(d3)

   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
   one  two
a  1.0  4.0
b  2.0  3.0
c  3.0  2.0
d  4.0  1.0


In [74]:
#From structured or record array

Since I am not yet familiar with NumPy structured arrays, a short detour to study them is useful here.
# Numpy's Structured Array
Source: <https://www.geeksforgeeks.org/numpy-structured-array/>

In [91]:
# Record layout: a 10-character unicode name, a 32-bit int age and a
# 64-bit float weight.
dt = np.dtype([("name", "U10"), ("age", np.int32), ("weight", np.float64)])
a = np.array([("Yavuz", 21, 67.4), ("Veli", 56, 70.0)], dtype=dt)
print(a)
print(a.dtype)

# Indexing by field name yields that field's values, so reductions apply
# to it directly.
ages = a["age"]
max_age = ages.max()
min_age = ages.min()

print("Max age = ", max_age)
print("Min age = ", min_age)

# Arrays sharing the same structured dtype can be concatenated.
b = np.array([("Mehmet", 34, 87)], dtype=dt)
c = np.concatenate([a, b])
print(c)

# Reshape the three records into a 3x1 column.
reshaped_c = c.reshape(3, 1)
print(reshaped_c)

[('Yavuz', 21, 67.4) ('Veli', 56, 70. )]
[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]
Max age =  56
Min age =  21
[('Yavuz', 21, 67.4) ('Veli', 56, 70. ) ('Mehmet', 34, 87. )]
[[('Yavuz', 21, 67.4)]
 [('Veli', 56, 70. )]
 [('Mehmet', 34, 87. )]]


In [26]:
# Build a DataFrame from a NumPy structured (record) array: every field of
# the array becomes a column. numpy and pandas come from the import cell at
# the top of the notebook, so they are not re-imported here.
data = np.zeros((2,), dtype=[("a", "i4"), ("b", "f4"), ("c", "S10")])
# "c" is a byte-string field ("S10"), so the text is shown as b'...'.
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
print(pd.DataFrame(data))
print(pd.DataFrame(data, index=["first", "second"]))   # custom row labels
print(pd.DataFrame(data, columns=["b", "c", "a"]))     # reordered columns

   a    b         c
0  1  2.0  b'Hello'
1  2  3.0  b'World'
        a    b         c
first   1  2.0  b'Hello'
second  2  3.0  b'World'
     b         c  a
0  2.0  b'Hello'  1
1  3.0  b'World'  2


In [29]:
# Build a DataFrame from a list of dicts: one dict per row, keys as columns.
data2 = [dict(a=1, b=2), dict(a=5, b=10, c=20)]
# The first record has no "c", so that cell is NaN.
print(pd.DataFrame(data2))
print(pd.DataFrame(data2, index=["first", "second"]))   # label the rows
print(pd.DataFrame(data2, columns=["c", "a"]))          # pick and order columns

   a   b     c
0  1   2   NaN
1  5  10  20.0
        a   b     c
first   1   2   NaN
second  5  10  20.0
      c  a
0   NaN  1
1  20.0  5


In [30]:
# Build a DataFrame from a dict keyed by tuples whose values are dicts keyed
# by tuples: the outer tuples give two-level column labels and the inner
# tuples give two-level row labels.
tuple_keyed = {
    ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
    ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
    ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
    ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
    ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
}
# Left as the cell's last expression so the notebook renders the frame.
pd.DataFrame(tuple_keyed)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [None]:
#From a Series