In [2]:
import numpy as np
import pandas as pd

#NaN floating point values that represents missing data.
float_data = pd.Series([1.2, -3.5, np.nan, 0])

In [2]:
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [3]:
float_data.isna() #Lets us know which values are na in the Series

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [5]:
data.dropna()#drops all na data from series

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [7]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
data.dropna() #drop na in a DataFrame instead of a series 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [10]:
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [11]:
df

Unnamed: 0,0,1,2
0,0.448326,0.14768,-0.238478
1,-1.202121,-0.085851,0.146978
2,0.831328,-0.185866,1.631025
3,1.049643,-1.598645,1.620998
4,1.906434,0.007014,0.671049
5,0.267111,1.515425,-0.032925
6,0.983485,-0.00327,-0.789788


In [13]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.448326,0.14768,-0.238478
1,-1.202121,-0.085851,0.146978
2,0.831328,-0.185866,1.631025
3,1.049643,-1.598645,1.620998
4,1.906434,0.007014,0.671049
5,0.267111,1.515425,-0.032925
6,0.983485,-0.00327,-0.789788


In [14]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.448326,0.14768,-0.238478
1,-1.202121,-0.085851,0.146978
2,0.831328,-0.185866,1.631025
3,1.049643,-1.598645,1.620998
4,1.906434,0.007014,0.671049
5,0.267111,1.515425,-0.032925
6,0.983485,-0.00327,-0.789788


In [15]:
 data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                      "k2": [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [16]:
data.duplicated() #tells if there are duplicate values or not

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [17]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [18]:
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                             "pastrami", "corned beef", "bacon",
                            "pastrami", "honey ham", "nova lox"],
                    "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [19]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [20]:
meat_to_animal = {
      "bacon": "pig",
      "pulled pork": "pig",
      "pastrami": "cow",
      "corned beef": "cow",
      "honey ham": "pig",
      "nova lox": "salmon"
}

data["animal"] = data["food"].map(meat_to_animal) #adds a row to data from called animal;

In [21]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [22]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [23]:
data.replace(-999, np.nan) #replaces -999 in Series with NaN. repleace(value to replace, value to replace it with)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [3]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                                 index=["Ohio", "Colorado", "New York"],
                                 columns=["one", "two", "three", "four"])


In [4]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [6]:
def transform(x):
     return x[:4].upper()
    

data.index.map(transform) # transforms every value into an uppercase version with only 4 spaces


Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [7]:
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                               "data1": range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [9]:
val = "a,b,  guido"

In [10]:
 val.split(",")

['a', 'b', '  guido']

In [12]:
pieces = [x.strip() for x in val.split(",")]

first, second, third = pieces

In [13]:
 pieces

['a', 'b', 'guido']

In [14]:
 first + "::" + second + "::" + third

'a::b::guido'

In [15]:
data = {"Dave": "dave@google.com", "Steve": "steve@gmail.com",
                 "Rob": "rob@gmail.com", "Wes": np.nan}
data = pd.Series(data)

In [16]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [17]:
type(data)

pandas.core.series.Series

In [18]:
data.isna() #Checks to see if there are any na vakues in data

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [19]:
data.str.contains("gmail")#checks to see if gmail is any of the data

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object