# Data Series Creation in Pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math
import random as rnd

In [2]:
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

In [4]:
weekdaysSeries = pd.Series(weekdays)
weekdaysSeries

0       Monday
1      Tuesday
2    Wednesday
3     Thursday
4       Friday
5     Saturday
6       Sunday
dtype: object

In [5]:
freeDays = [False, False, False, False, False, True, True]

In [6]:
freeDaysSeries = pd.Series(freeDays)
freeDaysSeries

0    False
1    False
2    False
3    False
4    False
5     True
6     True
dtype: bool

In [7]:
# Holiday name -> date string written as month-day.
holidays = {
    'New Year': '01-01',
    'Independence Day': '07-04',
    'Christmas': '12-25',
}

In [8]:
holidaysSeries = pd.Series(holidays)
holidaysSeries

New Year            01-01
Independence Day    07-04
Christmas           12-25
dtype: object

# Data Series attributes

In [19]:
# 100001 floats: every position is multiplied by a fresh random factor.
dataAsFloatList = list(position * rnd.random() for position in range(100001))
dataAsFloatSeries = pd.Series(data=dataAsFloatList)

In [21]:
dataAsFloatSeries.size


100001

In [22]:
dataAsFloatSeries.nbytes

800008

In [23]:
dataAsFloatSeries.shape

(100001,)

In [34]:
dataAsFloatSeries.dtypes

dtype('float64')

In [24]:
dataAsFloatSeries.axes

[RangeIndex(start=0, stop=100001, step=1)]

In [25]:
dataAsFloatSeries.index

RangeIndex(start=0, stop=100001, step=1)

In [26]:
dataAsFloatSeries.is_unique

True

In [27]:
dataAsFloatSeries.is_monotonic_increasing

False

In [29]:
dataAsFloatSeries.is_monotonic_decreasing

False

In [30]:
# Same kind of random numbers as above, but stored as their string representation.
dataAsStringList = list(str(position * rnd.random()) for position in range(100001))
dataasStringSeries = pd.Series(data=dataAsStringList)

In [31]:
dataasStringSeries.size

100001

In [32]:
dataasStringSeries.nbytes

800008

In [33]:
dataasStringSeries.dtypes

dtype('O')

# Data Series methods

In [63]:
cities = ['New York', 'Los Angeles', 'Chicago']
population = [8419600, 3980400, 2716000]

In [64]:
# Population figures labelled by city name.
citypop = pd.Series(population, index=cities)
citypop

New York       8419600
Los Angeles    3980400
Chicago        2716000
dtype: int64

In [40]:
round(citypop.mean())

5038667

In [41]:
citypop.sum()

np.int64(15116000)

In [42]:
citypop.index

Index(['New York', 'Los Angeles', 'Chicago'], dtype='object')

In [65]:
citypop.keys()

Index(['New York', 'Los Angeles', 'Chicago'], dtype='object')

In [67]:
citypop.values

array([8419600, 3980400, 2716000])

# Data Series filtering

In [47]:
# Counts per age bracket; the bracket labels become the Series index.
age = ["less than 6", "7-14", "15-17", "18-24", "25-39", "40-59", "more than 60"]
values = [14, 334, 312, 5823, 9491, 7486, 4343]
incidentsSeries = pd.Series(values, index=age)
incidentsSeries

less than 6       14
7-14             334
15-17            312
18-24           5823
25-39           9491
40-59           7486
more than 60    4343
dtype: int64

In [54]:
# where() keeps the entries that satisfy the condition and replaces the others with NaN,
# which is why the result is float64; dropna() then removes those NaN rows.
newIncidentsSeries = incidentsSeries.where(incidentsSeries > 1000).dropna()
# The original Series is left unmodified (compare the two printouts).
print(newIncidentsSeries)
print(incidentsSeries)

18-24           5823.0
25-39           9491.0
40-59           7486.0
more than 60    4343.0
dtype: float64
less than 6       14
7-14             334
15-17            312
18-24           5823
25-39           9491
40-59           7486
more than 60    4343
dtype: int64


In [55]:
incidentsSeries.filter(items=['18-24', '25-39', '40-59'])

18-24    5823
25-39    9491
40-59    7486
dtype: int64

In [57]:
# With inplace=True both calls change incidentsSeries itself instead of returning a new
# Series, so after this cell only the entries <= 1000 remain in the original variable.
incidentsSeries.where(incidentsSeries <= 1000, inplace=True)
incidentsSeries.dropna(inplace=True)
incidentsSeries

less than 6     14.0
7-14           334.0
15-17          312.0
dtype: float64

In [60]:
# Three parallel lists with 38 entries each: the cells below combine boolean masks built
# from the energy Series with nameSeries, so position i is meant to describe the same
# country in all three lists.
namesList = ['Albania','Austria','Belarus',
'Belgium','Bulgaria','Croatia','Cyprus','Czech Republic','Denmark','Estonia',
'Finland','France','Germany','Greece','Hungary','Iceland','Ireland','Italy',
'Latvia','Lithuania','Luxembourg','Macedonia','Malta','Montenegro','Netherlands',
'Norway','Poland','Portugal','Romania','Russia','Serbia','Slovenia','Spain', 'Sweden',
             'Switzerland','United Kingdom','Turkey','Ukraine']
# Energy figures per country for 2010 (unit not stated in this notebook).
energy2010List = [1947,8347,3564,8369,4560,3814,4623,6348,6328,6506,16483,7736,7264,5318,3876,
                  51440,5911,5494,3230,3471,16830,3521,4171,5420,7010,24891,3797,4959,2551,
                  6410,4359,6521,5707,14934,8175,2498,3550,5701]
# Energy figures per country for 2012 (unit not stated in this notebook).
energy2012List = [2118,8507,3698,7987,4762,3819,4057,6305,6039,6689,15687,7344,7270,5511,3919,
                  53203,5665,5398,3588,3608,14696,3626,4761,5416,6871,23658,3899,4736,2604,
                  6617,4387,6778,5573,14290,7886,2794,3641,5452]

# All three Series get the default 0..37 integer index, which keeps them aligned.
nameSeries = pd.Series(namesList)
energy2010Series = pd.Series(energy2010List)
energy2012Series = pd.Series(energy2012List)

In [62]:
mean2010 = energy2010Series.mean()
mean2010

np.float64(7779.8421052631575)

In [63]:
mean2012 = energy2012Series.mean()
mean2012

np.float64(7706.815789473684)

In [69]:
filterAboveMean2010 = energy2010Series > mean2010
filterAboveMean2010

0     False
1      True
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11    False
12    False
13    False
14    False
15     True
16    False
17    False
18    False
19    False
20     True
21    False
22    False
23    False
24    False
25     True
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33     True
34     True
35    False
36    False
37    False
dtype: bool

In [70]:
filterAboveMean2012 = energy2012Series > mean2012
filterAboveMean2012

0     False
1      True
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11    False
12    False
13    False
14    False
15     True
16    False
17    False
18    False
19    False
20     True
21    False
22    False
23    False
24    False
25     True
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33     True
34     True
35    False
36    False
37    False
dtype: bool

In [73]:
nameSeries.where(filterAboveMean2010 & filterAboveMean2012).dropna()

1         Austria
3         Belgium
10        Finland
15        Iceland
20     Luxembourg
25         Norway
33         Sweden
34    Switzerland
dtype: object

In [75]:
filterBeloweMean2010 = energy2010Series < mean2010
filterBeloweMean2010

0      True
1     False
2      True
3     False
4      True
5      True
6      True
7      True
8      True
9      True
10    False
11     True
12     True
13     True
14     True
15    False
16     True
17     True
18     True
19     True
20    False
21     True
22     True
23     True
24     True
25    False
26     True
27     True
28     True
29     True
30     True
31     True
32     True
33    False
34    False
35     True
36     True
37     True
dtype: bool

In [76]:
nameSeries.where(filterBeloweMean2010 & filterAboveMean2012).dropna()

Series([], dtype: object)

# Importing data from CSV file

In [5]:
# Load only the FormalEducation column. squeeze('columns') turns the one-column DataFrame
# into a Series and, unlike a bare squeeze(), never collapses a one-row file to a scalar.
education = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey.csv',
    usecols=['FormalEducation'],
).squeeze('columns')
education

0                                         Secondary school
1        Some college/university study without earning ...
2                                        Bachelor's degree
3                                          Doctoral degree
4                                          Master's degree
                               ...                        
51387                                    Bachelor's degree
51388                                      Master's degree
51389    Some college/university study without earning ...
51390                                    Bachelor's degree
51391                                    Bachelor's degree
Name: FormalEducation, Length: 51392, dtype: object

In [6]:
type(education)

pandas.core.series.Series

In [7]:
education.head()

0                                     Secondary school
1    Some college/university study without earning ...
2                                    Bachelor's degree
3                                      Doctoral degree
4                                      Master's degree
Name: FormalEducation, dtype: object

In [8]:
allinfo = pd.read_csv('data/data/StackOverflowDeveloperSurvey.csv')

In [9]:
# Selecting a single column already returns a Series, so no squeeze() is needed
# (on a one-row frame squeeze() would even reduce the Series to a scalar).
country = allinfo['Country']
country

0         United States
1        United Kingdom
2        United Kingdom
3         United States
4           Switzerland
              ...      
51387     United States
51388         Venezuela
51389            Canada
51390     United States
51391           Ireland
Name: Country, Length: 51392, dtype: object

In [10]:
type(country)

pandas.core.series.Series

In [11]:
country.head()

0     United States
1    United Kingdom
2    United Kingdom
3     United States
4       Switzerland
Name: Country, dtype: object

In [12]:
filterOnlyUSA = country == 'United States'

In [13]:
filterOnlyUSA

0         True
1        False
2        False
3         True
4        False
         ...  
51387     True
51388    False
51389    False
51390     True
51391    False
Name: Country, Length: 51392, dtype: bool

In [15]:
filterOnlyUSA.head()

0     True
1    False
2    False
3     True
4    False
Name: Country, dtype: bool

In [16]:
education.where(filterOnlyUSA).dropna().head(n=10)

0              Secondary school
3               Doctoral degree
6               Master's degree
10            Bachelor's degree
15    Primary/elementary school
17            Bachelor's degree
18            Bachelor's degree
25              Master's degree
27            Bachelor's degree
29            Bachelor's degree
Name: FormalEducation, dtype: object

In [19]:
# Salary column as a Series with missing answers removed. squeeze('columns') guarantees a
# Series even for a one-row file, where a bare squeeze() would return a scalar and the
# following dropna() would fail.
salary = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey.csv',
    usecols=['Salary'],
).squeeze('columns').dropna()
salary.head()


2     113750.0
14    100000.0
17    130000.0
18     82500.0
22    100764.0
Name: Salary, dtype: float64

In [20]:
len(salary)

12891

In [21]:
min(salary)

0.0

In [23]:
list(salary.head())

[113750.0, 100000.0, 130000.0, 82500.0, 100764.0]

In [24]:
dict(salary.head())

{2: np.float64(113750.0),
 14: np.float64(100000.0),
 17: np.float64(130000.0),
 18: np.float64(82500.0),
 22: np.float64(100764.0)}

In [25]:
listSalarySorted = sorted(salary,reverse=True)

In [28]:
for i in listSalarySorted[:5]:
    print(i)

197000.0
195000.0
195000.0
195000.0
195000.0


In [29]:
salary.name = 'Salary of person'

In [30]:
salary.head()

2     113750.0
14    100000.0
17    130000.0
18     82500.0
22    100764.0
Name: Salary of person, dtype: float64

# Sorting data in Data Series using Data Series methods

In [31]:
# Reload the Salary column so the sorting examples start from the file order and the
# original Series name. squeeze('columns') always yields a Series, whereas a bare
# squeeze() would return a scalar for a one-row file.
salary = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey.csv',
    usecols=['Salary'],
).squeeze('columns').dropna()
salary.head()

2     113750.0
14    100000.0
17    130000.0
18     82500.0
22    100764.0
Name: Salary, dtype: float64

In [32]:
salary.sort_values(ascending=False).head(5)

31600    197000.0
6743     195000.0
41398    195000.0
18237    195000.0
1696     195000.0
Name: Salary, dtype: float64

In [33]:
salary.sort_values(ascending=True).head(5)

4517     0.0
23953    0.0
23907    0.0
2951     0.0
44841    0.0
Name: Salary, dtype: float64

In [34]:
salary.sort_values(inplace=True, ascending=False)
salary.head()

31600    197000.0
6743     195000.0
41398    195000.0
18237    195000.0
1696     195000.0
Name: Salary, dtype: float64

In [35]:
salary.sort_index(ascending=False)

51390     40000.000000
51387     58000.000000
51382     32258.064516
51378    107526.881720
51371     74193.548387
             ...      
22       100764.000000
18        82500.000000
17       130000.000000
14       100000.000000
2        113750.000000
Name: Salary, Length: 12891, dtype: float64

In [36]:
maxSalaries = salary.sort_values(ascending=False).head(100)

In [37]:
maxSalaries

31600    197000.0
6743     195000.0
41398    195000.0
18237    195000.0
1696     195000.0
           ...   
8144     170000.0
26473    170000.0
6659     170000.0
13235    170000.0
21133    170000.0
Name: Salary, Length: 100, dtype: float64

In [38]:
minSalaries = salary.sort_values(ascending=True).head(100)
minSalaries

51144      0.000000
4517       0.000000
23907      0.000000
2951       0.000000
23953      0.000000
            ...    
15179    176.185582
33212    178.784267
50569    198.208780
48674    198.649186
1077     200.000000
Name: Salary, Length: 100, dtype: float64

In [40]:
maxSalaries.mean()


np.float64(180198.1045586777)

In [41]:
minSalaries.mean()

np.float64(67.08388907133794)

# Searching for certain values in Data Series

In [42]:
# Country column as a Series without missing answers. squeeze('columns') always yields a
# Series, whereas a bare squeeze() would return a scalar for a one-row file.
countries = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey.csv',
    usecols=['Country'],
).squeeze('columns').dropna()
countries.head()

0     United States
1    United Kingdom
2    United Kingdom
3     United States
4       Switzerland
Name: Country, dtype: object

In [47]:
# False: the `in` operator on a Series checks the index labels, not the stored values.
'Spain' in countries

False

In [46]:
'Spain' in countries.values

True

In [74]:
'Wonderland' in countries.values

False

# Slicing Data Series and getting values

In [77]:
# CompanySize column as a Series. dropna() removes unanswered rows but keeps the original
# row labels, so the integer index has gaps. squeeze('columns') always yields a Series,
# whereas a bare squeeze() would return a scalar for a one-row file.
surveys = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey.csv',
    usecols=['CompanySize'],
).squeeze('columns').dropna()

In [61]:
surveys.head()

0                         NaN
1          20 to 99 employees
2    10,000 or more employees
3    10,000 or more employees
4          10 to 19 employees
Name: CompanySize, dtype: object

In [62]:
surveys[3]

'10,000 or more employees'

In [63]:
surveys[1:11]

1           20 to 99 employees
2     10,000 or more employees
3     10,000 or more employees
4           10 to 19 employees
5                          NaN
6           20 to 99 employees
7      Fewer than 10 employees
8     5,000 to 9,999 employees
9                          NaN
10        100 to 499 employees
Name: CompanySize, dtype: object

In [64]:
surveys[12345]

'20 to 99 employees'

In [59]:
surveys[12341:12351]

15989    1,000 to 4,999 employees
15991          20 to 99 employees
15992          10 to 19 employees
15993    1,000 to 4,999 employees
15994          20 to 99 employees
15995        100 to 499 employees
15996          10 to 19 employees
15997                I don't know
15998          20 to 99 employees
15999        100 to 499 employees
Name: CompanySize, dtype: object

In [78]:
surveys.sort_values(inplace=True)

In [79]:
surveys[3]

'10,000 or more employees'

In [80]:
surveys[12345]

'20 to 99 employees'

In [81]:
surveys[12341:12351]

14554    10,000 or more employees
9429     10,000 or more employees
6384     10,000 or more employees
1102     10,000 or more employees
14549    10,000 or more employees
14547    10,000 or more employees
14570    10,000 or more employees
14575    10,000 or more employees
6410     10,000 or more employees
14576    10,000 or more employees
Name: CompanySize, dtype: object

In [82]:
surveys.reset_index(drop=True,inplace=True)

In [83]:
surveys[3]

'1,000 to 4,999 employees'

# Getting values from Data Series

In [86]:
# Country names indexed by their symbol. squeeze('columns') turns the remaining single
# column into a Series and never collapses a one-row file to a scalar.
countries = pd.read_csv(
    "data/data/countries.csv",
    usecols=['Symbol', 'Name'],
    index_col='Symbol',
).squeeze('columns')
countries.head(20)

Symbol
AF            Afghanistan
AL                Albania
DZ                Algeria
AD                Andorra
AO                 Angola
AI               Anguilla
AQ             Antarctica
AG    Antigua and Barbuda
SA           Saudi Arabia
AR              Argentina
AM                Armenia
AW                  Aruba
AU              Australia
AT                Austria
AZ             Azerbaijan
BS                Bahamas
BH                Bahrain
BD             Bangladesh
BB               Barbados
BE                Belgium
Name: Name, dtype: object

In [88]:
countries.loc['FR']

'France'

In [89]:
countries.iloc[13]

'Austria'

In [90]:
nordic = ['FI','SE','NO']

In [91]:
countries.loc[nordic]

Symbol
FI    Finland
SE     Sweden
NO     Norway
Name: Name, dtype: object

In [92]:
countries[nordic]

Symbol
FI    Finland
SE     Sweden
NO     Norway
Name: Name, dtype: object

# Reindex and intersection of Data Series

In [113]:
# Country names indexed by symbol, with missing names removed in the same chain instead of
# a separate in-place step. squeeze('columns') always yields a Series, whereas a bare
# squeeze() would return a scalar for a one-row file.
countries = (
    pd.read_csv('data/data/countries.csv', usecols=['Symbol', 'Name'], index_col='Symbol')
    .squeeze('columns')
    .dropna()
)
countries.head(20)

Symbol
AF            Afghanistan
AL                Albania
DZ                Algeria
AD                Andorra
AO                 Angola
AI               Anguilla
AQ             Antarctica
AG    Antigua and Barbuda
SA           Saudi Arabia
AR              Argentina
AM                Armenia
AW                  Aruba
AU              Australia
AT                Austria
AZ             Azerbaijan
BS                Bahamas
BH                Bahrain
BD             Bangladesh
BB               Barbados
BE                Belgium
Name: Name, dtype: object

In [96]:
toFind = ['BB','AA','BS']

In [98]:
countries.reindex(toFind)

Symbol
BB    Barbados
AA         NaN
BS     Bahamas
Name: Name, dtype: object

In [99]:
countries.index.intersection(toFind)

Index(['BS', 'BB'], dtype='object', name='Symbol')

In [101]:
countries.loc[countries.index.intersection(toFind)]

Symbol
BS     Bahamas
BB    Barbados
Name: Name, dtype: object

# Defining Data Series index while importing data

In [103]:
# Company titles indexed by rank. squeeze('columns') turns the remaining single column
# into a Series and never collapses a one-row file to a scalar.
fortune = pd.read_csv(
    'data/data/Fortune_500_2017.csv',
    usecols=['Rank', 'Title'],
    index_col='Rank',
).squeeze('columns')
fortune.head()

Rank
1               Walmart
2    Berkshire Hathaway
3                 Apple
4           Exxon Mobil
5              McKesson
Name: Title, dtype: object

In [104]:
fortune.head(10)

Rank
1                Walmart
2     Berkshire Hathaway
3                  Apple
4            Exxon Mobil
5               McKesson
6     UnitedHealth Group
7             CVS Health
8         General Motors
9                   AT&T
10            Ford Motor
Name: Title, dtype: object

In [105]:
fortune.tail(20)

Rank
481           Booz Allen Hamilton Holding
482                              Chemours
483    Western & Southern Financial Group
484                              Celanese
485                   Windstream Holdings
486                              Seaboard
487                             Essendant
488                                Apache
489                                Airgas
490                        Kelly Services
491                         Liberty Media
492                      Rockwell Collins
493             Robert Half International
494                             CH2M Hill
495                              Big Lots
496                         Michaels Cos.
497                         Toll Brothers
498                                 Yahoo
499                         Vistra Energy
500                        ABM Industries
Name: Title, dtype: object

In [107]:
# Employee counts indexed by company title. squeeze('columns') always yields a Series,
# whereas a bare squeeze() would return a scalar for a one-row file.
fortune = pd.read_csv(
    'data/data/Fortune_500_2017.csv',
    usecols=['Employees', 'Title'],
    index_col='Title',
).squeeze('columns')

In [108]:
searchingData= ['IBM','Alphabet','Apple','Facebook']
fortune.loc[searchingData]

Title
IBM         414400
Alphabet     72053
Apple       116000
Facebook     17048
Name: Employees, dtype: int64

In [109]:
fortune.loc['IBM':'Intel']

Title
IBM                          414400
State Farm Insurance Cos.     68234
Phillips 66                   14800
Johnson & Johnson            126400
Procter & Gamble             105000
Valero Energy                  9996
Target                       323000
Freddie Mac                    5982
Lowe’s                       240000
Dell Technologies            138000
MetLife                       58000
Aetna                         49500
PepsiCo                      264000
Archer Daniels Midland        31800
UPS                          335520
Intel                        106000
Name: Employees, dtype: int64

# More Data Series methods

In [2]:
# ConvertedSalary column of the 2018 survey without missing answers. squeeze('columns')
# always yields a Series, whereas a bare squeeze() would return a scalar for a one-row file.
programmers = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey2018.csv',
    low_memory=False,
    usecols=['ConvertedSalary'],
).squeeze('columns').dropna()

In [3]:
programmers.mean()

np.float64(95780.86178776571)

In [6]:
programmers.median()

np.float64(55075.0)

In [7]:
programmers.std()

np.float64(202348.21562529122)

In [17]:
programmers.max()

np.float64(2000000.0)

In [8]:
# Employee counts indexed by company title. squeeze('columns') always yields a Series,
# whereas a bare squeeze() would return a scalar for a one-row file.
fortune500 = pd.read_csv(
    'data/data/Fortune_500_2017.csv',
    usecols=['Title', 'Employees'],
    index_col='Title',
).squeeze('columns')
fortune500.head()

Title
Walmart               2300000
Berkshire Hathaway     367700
Apple                  116000
Exxon Mobil             72700
McKesson                68000
Name: Employees, dtype: int64

In [9]:
fortune500.idxmax()

'Walmart'

In [16]:
fortune500.loc[fortune500.idxmax()]

np.int64(2300000)

In [13]:
minEployees = fortune500.idxmin()
minEployees

'A-Mark Precious Metals'

In [14]:
fortune500.loc[minEployees]

np.int64(83)

# Modifying Data Series

In [18]:
# Note: this rebinds `surveys` (used for CompanySize earlier) to the Salary column.
# squeeze('columns') always yields a Series, whereas a bare squeeze() would return a
# scalar for a one-row file and the following dropna() would fail.
surveys = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey.csv',
    usecols=['Salary'],
).squeeze('columns').dropna()
surveys.head()

2     113750.0
14    100000.0
17    130000.0
18     82500.0
22    100764.0
Name: Salary, dtype: float64

In [20]:
# count() returns the number of non-null entries and len() the total length. The Series
# was loaded with dropna(), so both must agree; checking it makes use of the count()
# result, which would otherwise be discarded because only the last expression is shown.
assert surveys.count() == len(surveys)
len(surveys)

12891

In [22]:
surveysIncrease = surveys* 1.03
surveysIncrease.head()

2     117162.50
14    103000.00
17    133900.00
18     84975.00
22    103786.92
Name: Salary, dtype: float64

In [27]:
# HoursOutside answers of the 2018 survey without missing values. squeeze('columns')
# always yields a Series, whereas a bare squeeze() would return a scalar for a one-row file.
surveysTime = pd.read_csv(
    'data/data/StackOverflowDeveloperSurvey2018.csv',
    low_memory=False,
    usecols=['HoursOutside'],
).squeeze('columns').dropna()
surveysTime.head()

0             1 - 2 hours
1         30 - 59 minutes
3    Less than 30 minutes
4             1 - 2 hours
5         30 - 59 minutes
Name: HoursOutside, dtype: object

In [39]:
surveysTime.value_counts()

HoursOutside
1 - 2 HOURS             27788
30 - 59 MINUTES         24002
LESS THAN 30 MINUTES    11223
3 - 4 HOURS              7186
OVER 4 HOURS             1825
Name: count, dtype: int64

In [29]:
surveysTime = surveysTime.str.lower()
surveysTime.head()

0             1 - 2 hours
1         30 - 59 minutes
3    less than 30 minutes
4             1 - 2 hours
5         30 - 59 minutes
Name: HoursOutside, dtype: object

In [51]:
# apply() calls the given function once per element; here every answer is upper-cased.
surveysTime = surveysTime.apply(str.upper)
surveysTime.head()

0            1 - 2 HOURS
1        30 - 59 MINUTES
3    LESS THAN HALF HOUR
4            1 - 2 HOURS
5        30 - 59 MINUTES
Name: HoursOutside, dtype: object

In [40]:
def change_description(text: str):
    """Return the label to use for one answer.

    'LESS THAN 30 MINUTES' is reworded to 'LESS THAN HALF HOUR'; every other
    value is returned unchanged. The comparison is exact and case-sensitive.
    """
    return 'LESS THAN HALF HOUR' if text == 'LESS THAN 30 MINUTES' else text
surveysTime = surveysTime.apply(change_description)
surveysTime.head()

0            1 - 2 HOURS
1        30 - 59 MINUTES
3    LESS THAN HALF HOUR
4            1 - 2 HOURS
5        30 - 59 MINUTES
Name: HoursOutside, dtype: object

# Map method on Data Series

In [54]:
# Aircraft code -> aircraft model; the codes become the index of the Series.
training_dict = dict(
    PYT001='Airbus 320',
    PYT002='Boeing 737',
    PYT003='Airbus 321',
)
airCrafts = pd.Series(data=training_dict)
airCrafts

PYT001    Airbus 320
PYT002    Boeing 737
PYT003    Airbus 321
dtype: object

In [55]:
# Pick 100 aircraft codes at random. A dedicated, seeded generator makes the sample -- and
# the value counts derived from it further down -- identical on every run, without
# touching the state of the global rnd generator.
flightsRng = rnd.Random(2024)
flightsList = [flightsRng.choice(airCrafts.index) for _ in range(100)]

In [56]:
flightsList[:5]

['PYT002', 'PYT003', 'PYT003', 'PYT001', 'PYT003']

In [58]:
flights = pd.Series(flightsList)
flights.head()

0    PYT002
1    PYT003
2    PYT003
3    PYT001
4    PYT003
dtype: object

In [2]:
# map() looks every flight's aircraft code up in airCrafts and yields the model name.
flightsAirCrafts = flights.map(airCrafts)
flightsAirCrafts.head(n=5)

NameError: name 'flights' is not defined

In [60]:
flightsAirCrafts.value_counts()

Airbus 320    38
Boeing 737    31
Airbus 321    31
Name: count, dtype: int64