## introduction to numpy

- applying the numpy fundamentals on the NYC airport dataset taxi dataset
- here the data set columns

- pickup_month:  The month of the trip (January is 1, December is 12).
- pickup_day:  The day of the month of the trip.
- pickup_location_code:  The airport or borough where the trip started.
- dropoff_location_code: The airport or borough where the trip finished.
- trip_distance: The distance of the trip in miles.
- trip_length: The length of the trip in seconds.
- fare_amount: The base fare of the trip, in dollars.
- total_amount: The total amount charged to the passenger, including all fees, tolls and tips. 

In [1]:
import numpy as np 
import csv

In [2]:
#open the dataset using csv module and then convert the dtaset into nd array using numpy 

opened_file = open("nyc_taxis.csv")
file_as_list = list(csv.reader(opened_file))
header = file_as_list[0]
data = file_as_list[1:]
print(header)


['pickup_year', 'pickup_month', 'pickup_day', 'pickup_dayofweek', 'pickup_time', 'pickup_location_code', 'dropoff_location_code', 'trip_distance', 'trip_length', 'fare_amount', 'fees_amount', 'tolls_amount', 'tip_amount', 'total_amount', 'payment_type']


In [3]:
a = np.zeros((3,3), np.int32)
a[a == 0] = 5

a[1][a[1 , 2] == 5] = 4
a

array([[5, 5, 5],
       [4, 4, 4],
       [5, 5, 5]])

In [4]:
#before converting the dataset notice that
#you need to convert the data from string to numeric values first

converted_data = []

for row in data:
    converted_row = []
    for i in row:
        i = float(i)
        converted_row.append(i)
    converted_data.append(converted_row)
    
#convert the data into numpy 2D array now using the numpy.array() method
taxi = np.array(converted_data)
#show some rows
print(taxi[0:4])

[[2.016e+03 1.000e+00 1.000e+00 5.000e+00 0.000e+00 2.000e+00 4.000e+00
  2.100e+01 2.037e+03 5.200e+01 8.000e-01 5.540e+00 1.165e+01 6.999e+01
  1.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 5.000e+00 0.000e+00 2.000e+00 1.000e+00
  1.629e+01 1.520e+03 4.500e+01 1.300e+00 0.000e+00 8.000e+00 5.430e+01
  1.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 5.000e+00 0.000e+00 2.000e+00 6.000e+00
  1.270e+01 1.462e+03 3.650e+01 1.300e+00 0.000e+00 0.000e+00 3.780e+01
  2.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 5.000e+00 0.000e+00 2.000e+00 6.000e+00
  8.700e+00 1.210e+03 2.600e+01 1.300e+00 0.000e+00 5.460e+00 3.276e+01
  1.000e+00]]


In [5]:
# show some information about the dataset
print("the type of the data now is: ",type(taxi))

#print(number of rows and columns)
print("the shape of the matrix is: ", taxi.shape)
print("the type of the array elements is : ", taxi.dtype)

the type of the data now is:  <class 'numpy.ndarray'>
the shape of the matrix is:  (89560, 15)
the type of the array elements is :  float64


In [6]:
#show some data elements or 2d array like:
print(taxi[0:3 , 3:6])

[[5. 0. 2.]
 [5. 0. 2.]
 [5. 0. 2.]]


In [7]:
#show some days in the month 
print("some days in the month : ", taxi[[50,2000,2500], 2])

some days in the month :  [1. 5. 6.]


In [8]:
#if u wanna see the pick up year try this:
print("the pick up year is:" , taxi[:, 0])

the pick up year is: [2016. 2016. 2016. ... 2016. 2016. 2016.]


In [9]:
# if u wanna know the minimum fare amount try this:
print("the minumum fare amount is :" , taxi[:,9].min() , "it is negative amount it is not good at all we should clean it")

the minumum fare amount is : -52.0 it is negative amount it is not good at all we should clean it


In [10]:
# u can know also yhe maximum amount for some rows in the dataset easily:
print("the max values for row [2,3,5] are: ", taxi[[2,3,5]].max())

the max values for row [2,3,5] are:  2016.0


In [11]:
#what is the length of the taxi dataset ? 
print("the length of the dataset is {} rows:".format(len(taxi)))

the length of the dataset is 89560 rows:


In [12]:
#multiplication for 2 columns:
pickup_month = taxi[:, 9]
pickup_day = taxi[:,8]

product = pickup_month * pickup_day
print(product, "length of the product", len(product))

[105924.   68400.   53363.  ... 146744.   37363.5  82128. ] length of the product 89560


## calculate the miles per hour for each trip

In [13]:
length_by_hour = taxi[:,8] / 3600
miles = taxi[:,7]

speed= miles / length_by_hour
print(speed[:5])

[37.11340206 38.58157895 31.27222982 25.88429752 26.3715415 ]


In [14]:
# max speed out of all trips
print("the max speed for a trip is: ",speed.max())

the max speed for a trip is:  82800.0


- here it is impossible speed it faster than the fastest plane on the planet 82800

- total amount equals all the amounts of other [9:12] columns

In [15]:
total_amount = taxi[:, 13]
sum_amounts = taxi[:, 9:13].sum(axis = 1)

print(total_amount[:5])
print(sum_amounts[:5])

[69.99 54.3  37.8  32.76 18.8 ]
[69.99 54.3  37.8  32.76 18.8 ]


In [16]:
np.arange(5)

array([0, 1, 2, 3, 4])

### boolean indexing with boolean arrays

- open file using numpy

In [17]:
taxi = np.genfromtxt("nyc_taxis.csv" , delimiter="," , skip_header= 1 )
print(taxi[0])

[2.016e+03 1.000e+00 1.000e+00 5.000e+00 0.000e+00 2.000e+00 4.000e+00
 2.100e+01 2.037e+03 5.200e+01 8.000e-01 5.540e+00 1.165e+01 6.999e+01
 1.000e+00]


- starting boolean indexing

In [18]:
arr = np.array([1,2,3,5,20,35,15])
print(arr > 10)
print(arr[arr > 10])

[False False False False  True  True  True]
[20 35 15]


In [19]:
# apply for january month
pickup_month = taxi[:,1]
cond = pickup_month == 1
jan = pickup_month[cond]
print(jan)

[1. 1. 1. ... 1. 1. 1.]


In [20]:
february = pickup_month[taxi[:,1] == 2]
print(february)

[2. 2. 2. ... 2. 2. 2.]


- calculate the top tips for the tips column

In [21]:
tips = taxi[:,12]
bool_1 = taxi[:,12] > 50
top = tips[bool_1]
print(top)

[ 52.8   60.    59.34  80.    70.    60.    55.    65.    80.    62.
 100.    58.    62.    75.7   60.    70.  ]


In [22]:
#top fares 
print("top fares are \n ", taxi[:,9][taxi[:,9]>100])

top fares are 
  [109.5 122.  117.  112.  114.5 102.  105.5 110.5 110.5 101.5 128.  134.5
 102.5 113.  102.5 116.  110.5 119.5 106.5 110.  105.5 113.  115.5 106.5
 120.  121.  100.5 130.  123.  110.5 106.5 400.  116.5 125.  157.5 103.
 120.5 104.  101.  101.  101.  115.5 129.5 112.  106.  114.5 107.  119.
 126.  115.  123.  111.  110.  102.5 110.  126.  112.5 101.5 220.  108.
 150.  101.  134.  101.  113.  180.5 104.  129.  107.5 120.  112.  117.5
 114.5 116.  117.5 116.5 113.5]


In [23]:
#total highest fees
total_amount = taxi[:,13]
cond = total_amount > 200
print(total_amount[cond])

[400.3  453.34 220.3  834.84 286.84]


### assigning values using boolean indexing

In [24]:
#change the maximum fees to be 850 dollr 
taxi[:,13][taxi[:,13] == 834.84] = 850
print(taxi[:,13].max())

850.0


In [25]:
x = np.zeros((5,5))
x[0,1] = 10
x[:2, 3] = 5
x[:,2] = [1,5,20,15,2]
x[4,0] = 50


In [26]:
x[x==50] = 48
x

array([[ 0., 10.,  1.,  5.,  0.],
       [ 0.,  0.,  5.,  5.,  0.],
       [ 0.,  0., 20.,  0.,  0.],
       [ 0.,  0., 15.,  0.,  0.],
       [48.,  0.,  2.,  0.,  0.]])

In [27]:
taxi_copy = taxi.copy()
total_amount = taxi_copy[:,13]
total_amount[total_amount <0] = 0
total_amount[total_amount == 0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.])

In [28]:
bool_1 = taxi_copy[:,13] > 200
taxi_copy[bool_1 , 13] = 500
taxi_copy[:,13][taxi[:, 13]>400]

array([500., 500., 500.])

- create new columns using boolean indexing

In [29]:
col = np.zeros((89560, 1))
np.concatenate([taxi_copy , col], axis = 1)

array([[2.016e+03, 1.000e+00, 1.000e+00, ..., 6.999e+01, 1.000e+00,
        0.000e+00],
       [2.016e+03, 1.000e+00, 1.000e+00, ..., 5.430e+01, 1.000e+00,
        0.000e+00],
       [2.016e+03, 1.000e+00, 1.000e+00, ..., 3.780e+01, 2.000e+00,
        0.000e+00],
       ...,
       [2.016e+03, 6.000e+00, 3.000e+01, ..., 6.334e+01, 1.000e+00,
        0.000e+00],
       [2.016e+03, 6.000e+00, 3.000e+01, ..., 4.475e+01, 1.000e+00,
        0.000e+00],
       [2.016e+03, 6.000e+00, 3.000e+01, ..., 5.484e+01, 2.000e+00,
        0.000e+00]])

# introduction to pandas

## write the data dictionary first of all

# working with pandas

In [30]:
import pandas as pd
f500 = pd.read_csv("f500.csv" , index_col = 0)
f500.head()

Unnamed: 0_level_0,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Walmart,1,485873,0.8,13643.0,198825,-7.2,C. Douglas McMillon,General Merchandisers,Retailing,1,USA,"Bentonville, AR",http://www.walmart.com,23,2300000,77798
State Grid,2,315199,-4.4,9571.3,489838,-6.2,Kou Wei,Utilities,Energy,2,China,"Beijing, China",http://www.sgcc.com.cn,17,926067,209456
Sinopec Group,3,267518,-9.1,1257.9,310726,-65.0,Wang Yupu,Petroleum Refining,Energy,4,China,"Beijing, China",http://www.sinopec.com,19,713288,106523
China National Petroleum,4,262573,-12.3,1867.5,585619,-73.7,Zhang Jianhua,Petroleum Refining,Energy,3,China,"Beijing, China",http://www.cnpc.com.cn,17,1512048,301893
Toyota Motor,5,254694,7.7,16899.3,437575,-12.3,Akio Toyoda,Motor Vehicles and Parts,Motor Vehicles & Parts,8,Japan,"Toyota, Japan",http://www.toyota-global.com,23,364445,157210


In [31]:
f500.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, Walmart to AutoNation
Data columns (total 16 columns):
rank                        500 non-null int64
revenues                    500 non-null int64
revenue_change              498 non-null float64
profits                     499 non-null float64
assets                      500 non-null int64
profit_change               436 non-null float64
ceo                         500 non-null object
industry                    500 non-null object
sector                      500 non-null object
previous_rank               500 non-null int64
country                     500 non-null object
hq_location                 500 non-null object
website                     500 non-null object
years_on_global_500_list    500 non-null int64
employees                   500 non-null int64
total_stockholder_equity    500 non-null int64
dtypes: float64(3), int64(7), object(6)
memory usage: 66.4+ KB


In [32]:
f500.describe(include = "O")

Unnamed: 0,ceo,industry,sector,country,hq_location,website
count,500,500,500,500,500,500
unique,500,58,21,34,235,500
top,Matthias Muller,Banks: Commercial and Savings,Financials,USA,"Beijing, China",http://www.commbank.com.au
freq,1,51,118,132,56,1


In [33]:
f500.shape

(500, 16)

In [34]:
f500.tail(3)

Unnamed: 0_level_0,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Wm. Morrison Supermarkets,498,21741,-11.3,406.4,11630,20.4,David T. Potts,Food and Drug Stores,Food & Drug Stores,437,Britain,"Bradford, Britain",http://www.morrisons.com,13,77210,5111
TUI,499,21655,-5.5,1151.7,16247,195.5,Friedrich Joussen,Travel Services,Business Services,467,Germany,"Hanover, Germany",http://www.tuigroup.com,23,66779,3006
AutoNation,500,21609,3.6,430.5,10060,-2.7,Michael J. Jackson,Specialty Retailers,Retailing,0,USA,"Fort Lauderdale, FL",http://www.autonation.com,12,26000,2310


In [35]:
f500.dtypes

rank                          int64
revenues                      int64
revenue_change              float64
profits                     float64
assets                        int64
profit_change               float64
ceo                          object
industry                     object
sector                       object
previous_rank                 int64
country                      object
hq_location                  object
website                      object
years_on_global_500_list      int64
employees                     int64
total_stockholder_equity      int64
dtype: object

In [36]:
f500.dtypes["profits"]

dtype('float64')

In [37]:
#select rows
f_row = f500.loc["Walmart"]
f_row

rank                                             1
revenues                                    485873
revenue_change                                 0.8
profits                                      13643
assets                                      198825
profit_change                                 -7.2
ceo                            C. Douglas McMillon
industry                     General Merchandisers
sector                                   Retailing
previous_rank                                    1
country                                        USA
hq_location                        Bentonville, AR
website                     http://www.walmart.com
years_on_global_500_list                        23
employees                                  2300000
total_stockholder_equity                     77798
Name: Walmart, dtype: object

In [38]:
f500.iloc[0]

rank                                             1
revenues                                    485873
revenue_change                                 0.8
profits                                      13643
assets                                      198825
profit_change                                 -7.2
ceo                            C. Douglas McMillon
industry                     General Merchandisers
sector                                   Retailing
previous_rank                                    1
country                                        USA
hq_location                        Bentonville, AR
website                     http://www.walmart.com
years_on_global_500_list                        23
employees                                  2300000
total_stockholder_equity                     77798
Name: Walmart, dtype: object

In [39]:
f500.loc['Walmart': "State Grid"]

Unnamed: 0_level_0,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Walmart,1,485873,0.8,13643.0,198825,-7.2,C. Douglas McMillon,General Merchandisers,Retailing,1,USA,"Bentonville, AR",http://www.walmart.com,23,2300000,77798
State Grid,2,315199,-4.4,9571.3,489838,-6.2,Kou Wei,Utilities,Energy,2,China,"Beijing, China",http://www.sgcc.com.cn,17,926067,209456


In [40]:
f500.iloc[0:1]

Unnamed: 0_level_0,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Walmart,1,485873,0.8,13643.0,198825,-7.2,C. Douglas McMillon,General Merchandisers,Retailing,1,USA,"Bentonville, AR",http://www.walmart.com,23,2300000,77798


In [41]:
f500.loc[:, "ceo":]

Unnamed: 0_level_0,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Walmart,C. Douglas McMillon,General Merchandisers,Retailing,1,USA,"Bentonville, AR",http://www.walmart.com,23,2300000,77798
State Grid,Kou Wei,Utilities,Energy,2,China,"Beijing, China",http://www.sgcc.com.cn,17,926067,209456
Sinopec Group,Wang Yupu,Petroleum Refining,Energy,4,China,"Beijing, China",http://www.sinopec.com,19,713288,106523
China National Petroleum,Zhang Jianhua,Petroleum Refining,Energy,3,China,"Beijing, China",http://www.cnpc.com.cn,17,1512048,301893
Toyota Motor,Akio Toyoda,Motor Vehicles and Parts,Motor Vehicles & Parts,8,Japan,"Toyota, Japan",http://www.toyota-global.com,23,364445,157210
...,...,...,...,...,...,...,...,...,...,...
Teva Pharmaceutical Industries,Yitzhak Peterburg,Pharmaceuticals,Health Care,0,Israel,"Petach Tikva, Israel",http://www.tevapharm.com,1,56960,33337
New China Life Insurance,Wan Feng,"Insurance: Life, Health (stock)",Financials,427,China,"Beijing, China",http://www.newchinalife.com,2,54378,8507
Wm. Morrison Supermarkets,David T. Potts,Food and Drug Stores,Food & Drug Stores,437,Britain,"Bradford, Britain",http://www.morrisons.com,13,77210,5111
TUI,Friedrich Joussen,Travel Services,Business Services,467,Germany,"Hanover, Germany",http://www.tuigroup.com,23,66779,3006


In [42]:
f500.loc[:, ["ceo", "previous_rank"]]

Unnamed: 0_level_0,ceo,previous_rank
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Walmart,C. Douglas McMillon,1
State Grid,Kou Wei,2
Sinopec Group,Wang Yupu,4
China National Petroleum,Zhang Jianhua,3
Toyota Motor,Akio Toyoda,8
...,...,...
Teva Pharmaceutical Industries,Yitzhak Peterburg,0
New China Life Insurance,Wan Feng,427
Wm. Morrison Supermarkets,David T. Potts,437
TUI,Friedrich Joussen,467


In [43]:
pd.Series([3,5,25,26])[2]

25

In [44]:
f500.loc[:, ["rank" , "revenue_change" , "profits"]]

Unnamed: 0_level_0,rank,revenue_change,profits
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Walmart,1,0.8,13643.0
State Grid,2,-4.4,9571.3
Sinopec Group,3,-9.1,1257.9
China National Petroleum,4,-12.3,1867.5
Toyota Motor,5,7.7,16899.3
...,...,...,...
Teva Pharmaceutical Industries,496,11.5,329.0
New China Life Insurance,497,-13.3,743.9
Wm. Morrison Supermarkets,498,-11.3,406.4
TUI,499,-5.5,1151.7


In [45]:
f500.loc[["State Grid" , "China National Petroleum"]]

Unnamed: 0_level_0,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
State Grid,2,315199,-4.4,9571.3,489838,-6.2,Kou Wei,Utilities,Energy,2,China,"Beijing, China",http://www.sgcc.com.cn,17,926067,209456
China National Petroleum,4,262573,-12.3,1867.5,585619,-73.7,Zhang Jianhua,Petroleum Refining,Energy,3,China,"Beijing, China",http://www.cnpc.com.cn,17,1512048,301893


In [46]:
# Series
f500["rank"]

company
Walmart                             1
State Grid                          2
Sinopec Group                       3
China National Petroleum            4
Toyota Motor                        5
                                 ... 
Teva Pharmaceutical Industries    496
New China Life Insurance          497
Wm. Morrison Supermarkets         498
TUI                               499
AutoNation                        500
Name: rank, Length: 500, dtype: int64

In [47]:
rank = f500.loc[:, "rank"]
print(type(rank))

<class 'pandas.core.series.Series'>


In [48]:
pd.Series([1,20,50,55,32])

0     1
1    20
2    50
3    55
4    32
dtype: int64

In [49]:
a = f500.iloc[:, 5]
a.value_counts(dropna = False)

 NaN     64
 0.9      6
 4.5      4
-1.8      4
 3.7      4
         ..
-53.4     1
 31.1     1
-33.6     1
 30.1     1
-65.0     1
Name: profit_change, Length: 369, dtype: int64

In [50]:
country = f500.loc[:, "country"]
country.describe()

count     500
unique     34
top       USA
freq      132
Name: country, dtype: object

In [51]:
country.value_counts()

USA             132
China           109
Japan            51
Germany          29
France           29
Britain          24
South Korea      15
Switzerland      14
Netherlands      14
Canada           11
Spain             9
India             7
Brazil            7
Italy             7
Australia         7
Taiwan            6
Russia            4
Ireland           4
Sweden            3
Singapore         3
Mexico            2
Denmark           1
Venezuela         1
Norway            1
U.A.E             1
Luxembourg        1
Indonesia         1
Thailand          1
Saudi Arabia      1
Turkey            1
Israel            1
Belgium           1
Finland           1
Malaysia          1
Name: country, dtype: int64

In [52]:
f500[f500["country"] == "Israel"] = np.nan

In [53]:
country.value_counts(dropna = False)

USA             132
China           109
Japan            51
France           29
Germany          29
Britain          24
South Korea      15
Switzerland      14
Netherlands      14
Canada           11
Spain             9
Australia         7
India             7
Brazil            7
Italy             7
Taiwan            6
Russia            4
Ireland           4
Sweden            3
Singapore         3
Mexico            2
Luxembourg        1
Malaysia          1
Venezuela         1
Norway            1
U.A.E             1
Denmark           1
Finland           1
Indonesia         1
Saudi Arabia      1
Turkey            1
Thailand          1
Belgium           1
NaN               1
Name: country, dtype: int64

In [54]:
f500.loc[:, "country"].value_counts()

USA             132
China           109
Japan            51
Germany          29
France           29
Britain          24
South Korea      15
Switzerland      14
Netherlands      14
Canada           11
Spain             9
India             7
Brazil            7
Italy             7
Australia         7
Taiwan            6
Russia            4
Ireland           4
Singapore         3
Sweden            3
Mexico            2
Denmark           1
Finland           1
Belgium           1
Thailand          1
Turkey            1
Saudi Arabia      1
Indonesia         1
Luxembourg        1
U.A.E             1
Norway            1
Venezuela         1
Malaysia          1
Name: country, dtype: int64

In [55]:
ceo = f500.loc[:, "ceo"]
ceo.value_counts()

Matthias Muller        1
Michael J. Jackson     1
Philippe Wahl          1
Olivier Brandicourt    1
Ryuichi Isaka          1
                      ..
Andrew G. Thorburn     1
Michael L. Corbat      1
Liu Hualong            1
Mark T. Bertolini      1
Wesley G. Bush         1
Name: ceo, Length: 499, dtype: int64

In [56]:
ceo.describe()

count                 499
unique                499
top       Matthias Muller
freq                    1
Name: ceo, dtype: object

In [57]:
ceo.value_counts()["Ryuichi Isaka"]

1

In [58]:
ceo[50:60]

company
China Life Insurance                Yang Mingsheng
BMW Group                            Harald Kruger
Express Scripts Holding       Timothy C. Wentworth
Trafigura Group                        Jeremy Weir
China Railway Engineering            Zhang Zongyan
Prudential                        Michael A. Wells
Assicurazioni Generali          Philippe R. Donnet
China Railway Construction           Meng Fengchao
Home Depot                         Craig A. Menear
Boeing                        Dennis A. Muilenburg
Name: ceo, dtype: object

In [59]:
ceo.loc["BMW Group" : "Home Depot"]

company
BMW Group                            Harald Kruger
Express Scripts Holding       Timothy C. Wentworth
Trafigura Group                        Jeremy Weir
China Railway Engineering            Zhang Zongyan
Prudential                        Michael A. Wells
Assicurazioni Generali          Philippe R. Donnet
China Railway Construction           Meng Fengchao
Home Depot                         Craig A. Menear
Name: ceo, dtype: object

In [60]:
f500.loc["BMW Group" : "Home Depot" , "rank" : "ceo"]

Unnamed: 0_level_0,rank,revenues,revenue_change,profits,assets,profit_change,ceo
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BMW Group,52.0,104130.0,1.8,7589.4,198835.0,7.4,Harald Kruger
Express Scripts Holding,53.0,100288.0,-1.4,3404.4,51745.0,37.5,Timothy C. Wentworth
Trafigura Group,54.0,98098.0,0.9,750.8,41230.0,-39.3,Jeremy Weir
China Railway Engineering,55.0,96979.0,-2.5,924.1,108864.0,-6.0,Zhang Zongyan
Prudential,56.0,96965.0,53.7,2592.8,581221.0,-34.2,Michael A. Wells
Assicurazioni Generali,57.0,95217.0,-7.2,2301.3,549656.0,2.2,Philippe R. Donnet
China Railway Construction,58.0,94877.0,-0.8,1192.4,109968.0,7.8,Meng Fengchao
Home Depot,59.0,94595.0,6.9,7957.0,42966.0,13.5,Craig A. Menear


In [61]:
big_movers = f500.loc[["Aviva" , "HP", "JD.com"]] , [["rank", "previous_rank"]]
bottom_companies = f500.loc["National Grid" : "AutoNation" , ["rank" , "sector", "country"]]

In [62]:
rank = f500.loc[:, "rank"]
prev_rank = f500.loc[:, "previous_rank"]
diff = prev_rank - rank 
diff

company
Walmart                             0.0
State Grid                          0.0
Sinopec Group                       1.0
China National Petroleum           -1.0
Toyota Motor                        3.0
                                  ...  
Teva Pharmaceutical Industries      NaN
New China Life Insurance          -70.0
Wm. Morrison Supermarkets         -61.0
TUI                               -32.0
AutoNation                       -500.0
Length: 500, dtype: float64

- 0 value means the compan's rank did not changed
- negative value (-) means the compan's rank has been gone down
- positive value (+) means the compan's rnk has been raised

In [63]:
#data exploration methods
f500.max()

rank                            500.0
revenues                     485873.0
revenue_change                  442.3
profits                       45687.0
assets                      3473238.0
profit_change                  8909.5
previous_rank                   500.0
years_on_global_500_list         23.0
employees                   2300000.0
total_stockholder_equity     301893.0
dtype: float64

In [64]:
rank_change_max = diff.max()
print(rank_change_max)


226.0


In [65]:
print("the company with the max rank exchange is  : \n", f500.loc[diff == 226, "country"])

the company with the max rank exchange is  : 
 company
Centene    USA
Name: country, dtype: object


In [66]:
print("the company with the minimum rank exchange is :\n", f500.loc[diff == diff.min() , "country"])
#show the value of minimum ank exchange
print(diff.min())

the company with the minimum rank exchange is :
 company
AutoNation    USA
Name: country, dtype: object
-500.0


- the minimum value of the rank exchange here is -500 but it is wrong value because the minimum value you should reach here is when you go down from 1 to 500 (1-500 = 499) it is supposed to -499  this indicates that we have incorrect data in either the rank column or the previous_rank column

In [67]:
print("statistics about rank column: \n" , f500.loc[:, "rank"].describe())

statistics about rank column: 
 count    499.000000
mean     250.008016
std      144.206971
min        1.000000
25%      125.500000
50%      250.000000
75%      374.500000
max      500.000000
Name: rank, dtype: float64


In [68]:
print("some statistics about the ceo Series :\n" , f500.loc[:, "ceo"].describe())

some statistics about the ceo Series :
 count                 499
unique                499
top       Matthias Muller
freq                    1
Name: ceo, dtype: object


In [69]:
print(f500["previous_rank"].describe())

count    499.000000
mean     222.579158
std      146.751543
min        0.000000
25%       93.500000
50%      220.000000
75%      347.500000
max      500.000000
Name: previous_rank, dtype: float64


In [70]:
f500.loc[:, "previous_rank"].value_counts(dropna = True)

0.0      32
471.0     1
234.0     1
125.0     1
166.0     1
         ..
191.0     1
369.0     1
179.0     1
161.0     1
1.0       1
Name: previous_rank, Length: 468, dtype: int64

In [71]:
f500.max(axis = 0)

rank                            500.0
revenues                     485873.0
revenue_change                  442.3
profits                       45687.0
assets                      3473238.0
profit_change                  8909.5
previous_rank                   500.0
years_on_global_500_list         23.0
employees                   2300000.0
total_stockholder_equity     301893.0
dtype: float64

In [72]:
f500.max(axis = 1)

company
Walmart                           2300000.0
State Grid                         926067.0
Sinopec Group                      713288.0
China National Petroleum          1512048.0
Toyota Motor                       437575.0
                                    ...    
Teva Pharmaceutical Industries          NaN
New China Life Insurance           100609.0
Wm. Morrison Supermarkets           77210.0
TUI                                 66779.0
AutoNation                          26000.0
Length: 500, dtype: float64

In [73]:
f500.max(numeric_only= True)

rank                            500.0
revenues                     485873.0
revenue_change                  442.3
profits                       45687.0
assets                      3473238.0
profit_change                  8909.5
previous_rank                   500.0
years_on_global_500_list         23.0
employees                   2300000.0
total_stockholder_equity     301893.0
dtype: float64

In [74]:
f500.describe()

Unnamed: 0,rank,revenues,revenue_change,profits,assets,profit_change,previous_rank,years_on_global_500_list,employees,total_stockholder_equity
count,499.0,499.0,497.0,498.0,499.0,435.0,499.0,499.0,499.0,499.0
mean,250.008016,55483.519038,4.524346,3060.67751,243934.4,24.390575,222.579158,15.064128,134152.7,30622.647295
std,144.206971,45746.672314,28.576119,5175.734323,485633.5,437.985104,146.751543,7.915714,170223.4,43686.203717
min,1.0,21609.0,-67.3,-13038.0,3717.0,-793.7,0.0,1.0,328.0,-59909.0
25%,125.5,29093.0,-5.9,562.775,36584.0,-22.65,93.5,7.0,42727.0,7552.5
50%,250.0,40238.0,0.5,1763.1,72985.0,-0.3,220.0,17.0,93123.0,15724.0
75%,374.5,64212.5,6.9,3968.0,180628.0,18.0,347.5,23.0,169002.5,37977.0
max,500.0,485873.0,442.3,45687.0,3473238.0,8909.5,500.0,23.0,2300000.0,301893.0


In [75]:
f500.describe(include = "O")

Unnamed: 0,ceo,industry,sector,country,hq_location,website
count,499,499,499,499,499,499
unique,499,58,21,33,234,499
top,Matthias Muller,Banks: Commercial and Savings,Financials,USA,"Beijing, China",http://www.commbank.com.au
freq,1,51,118,132,56,1


In [76]:
print(type(f500.loc[:, "rank"].describe()))

<class 'pandas.core.series.Series'>


In [77]:
f500.describe(include = ["O"])

Unnamed: 0,ceo,industry,sector,country,hq_location,website
count,499,499,499,499,499,499
unique,499,58,21,33,234,499
top,Matthias Muller,Banks: Commercial and Savings,Financials,USA,"Beijing, China",http://www.commbank.com.au
freq,1,51,118,132,56,1


In [78]:
#pandas assignment 
c = f500.copy()
c["rank"] = 0
print(c["rank"])
c["rank"] = range(1,501)


company
Walmart                           0
State Grid                        0
Sinopec Group                     0
China National Petroleum          0
Toyota Motor                      0
                                 ..
Teva Pharmaceutical Industries    0
New China Life Insurance          0
Wm. Morrison Supermarkets         0
TUI                               0
AutoNation                        0
Name: rank, Length: 500, dtype: int64


In [79]:
print(c["rank"])

company
Walmart                             1
State Grid                          2
Sinopec Group                       3
China National Petroleum            4
Toyota Motor                        5
                                 ... 
Teva Pharmaceutical Industries    496
New China Life Insurance          497
Wm. Morrison Supermarkets         498
TUI                               499
AutoNation                        500
Name: rank, Length: 500, dtype: int32


In [80]:
c.loc["Walmart" , "ceo"] = "Eslam Hosam"
c.loc["Walmart" , "ceo"]

'Eslam Hosam'

In [81]:
#top rank companies
top_5 = f500.loc[:, ["rank" , "revenues"]]
top_5.head()

Unnamed: 0_level_0,rank,revenues
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Walmart,1.0,485873.0
State Grid,2.0,315199.0
Sinopec Group,3.0,267518.0
China National Petroleum,4.0,262573.0
Toyota Motor,5.0,254694.0


In [82]:
top_5.loc["Sinopec Group" , "rank"] = 30
top_5.loc["Sinopec Group"]

rank            30.0
revenues    267518.0
Name: Sinopec Group, dtype: float64

In [83]:
f500.loc["Dow Chemical" , "ceo"] = "Jim Fitterling"
print("the ceo of Dow Chemicals is {}".format(f500.loc["Dow Chemical" , "ceo"]))

the ceo of Dow Chemicals is Jim Fitterling


In [84]:
f500.loc["Walmart" , "ceo"] = "Eslam Hosam"
print("the ceo of Walmart now is {}".format(f500.loc["Walmart" , "ceo"]))

the ceo of Walmart now is Eslam Hosam


In [85]:
df = pd.DataFrame([["kyle" , 12], ["esoo" , 8] , ["hamoo" , 8], ["mada" , 5] , ["mada2" , 12]] , columns=["a" , "b"])
df

Unnamed: 0,a,b
0,kyle,12
1,esoo,8
2,hamoo,8
3,mada,5
4,mada2,12


In [86]:
df[df["b"] == 8]

Unnamed: 0,a,b
1,esoo,8
2,hamoo,8


In [87]:
print(df[df["a"] == "esoo"])

      a  b
1  esoo  8


In [88]:
df[df["b"] == 12]

Unnamed: 0,a,b
0,kyle,12
4,mada2,12


In [89]:
df.loc[df["a"] == "esoo" , "b"]

1    8
Name: b, dtype: int64

In [90]:
df.loc[df["a"] == "mada2" , "b"]

4    12
Name: b, dtype: int64

In [91]:
c1 = df["a"] == "mada"
c2 = df["a"] == "mada2"
df[c1]

Unnamed: 0,a,b
3,mada,5


In [92]:
bool_1 = f500["industry"] == "Motor Vehicles and Parts"
motor_countries = f500.loc[bool_1 , "country"]

In [93]:
f500.loc[f500.loc[:,"sector"] == 'Motor Vehicles & Parts', "sector" ] = 'Motor Vehicles and Parts'

f500.loc[f500.loc[:,"sector"] == 'Motor Vehicles and Parts', "sector" ]

company
Toyota Motor                           Motor Vehicles and Parts
Volkswagen                             Motor Vehicles and Parts
Daimler                                Motor Vehicles and Parts
General Motors                         Motor Vehicles and Parts
Ford Motor                             Motor Vehicles and Parts
Honda Motor                            Motor Vehicles and Parts
SAIC Motor                             Motor Vehicles and Parts
Nissan Motor                           Motor Vehicles and Parts
BMW Group                              Motor Vehicles and Parts
Dongfeng Motor                         Motor Vehicles and Parts
Robert Bosch                           Motor Vehicles and Parts
Hyundai Motor                          Motor Vehicles and Parts
China FAW Group                        Motor Vehicles and Parts
Beijing Automotive Group               Motor Vehicles and Parts
Peugeot                                Motor Vehicles and Parts
Renault                         

In [94]:
f500.loc[f500["industry"] == "Motor Vehicles and Parts" , "country"] = "Egypt"

In [95]:
f500.loc[f500["industry"] == "Motor Vehicles and Parts" , 'country'].value_counts()

Egypt    34
Name: country, dtype: int64

In [96]:
f500.loc[f500["previous_rank"] == 0 , "previous_rank"] = np.nan
(f500["previous_rank"] == 50).value_counts()

False    499
True       1
Name: previous_rank, dtype: int64

In [97]:
f500["previous_rank"].value_counts(dropna = False)

NaN      33
471.0     1
234.0     1
125.0     1
166.0     1
         ..
191.0     1
369.0     1
179.0     1
161.0     1
1.0       1
Name: previous_rank, Length: 468, dtype: int64

In [98]:
f500["hg"] = 20
print(f500["hg"])

company
Walmart                           20
State Grid                        20
Sinopec Group                     20
China National Petroleum          20
Toyota Motor                      20
                                  ..
Teva Pharmaceutical Industries    20
New China Life Insurance          20
Wm. Morrison Supermarkets         20
TUI                               20
AutoNation                        20
Name: hg, Length: 500, dtype: int64


In [99]:
f500["difference"] = f500["profits"] - f500["assets"]
f500["difference"]

company
Walmart                          -185182.0
State Grid                       -480266.7
Sinopec Group                    -309468.1
China National Petroleum         -583751.5
Toyota Motor                     -420675.7
                                    ...   
Teva Pharmaceutical Industries         NaN
New China Life Insurance          -99865.1
Wm. Morrison Supermarkets         -11223.6
TUI                               -15095.3
AutoNation                         -9629.5
Name: difference, Length: 500, dtype: float64

In [100]:
f500.shape

(500, 18)

- now we have 18 columns because we have added more columns to our dataset

- challenge top performance by country

In [101]:
top_2 = f500["country"].value_counts().head(2)
top_2

USA      130
China    102
Name: country, dtype: int64

- Create a series, industry_usa, containing counts of the two most common values in the industry column for companies headquartered in the USA.

In [102]:
top_industry_usa = f500.loc[f500["country"] == "USA", "industry"].value_counts().head(2)
print("USA top industries are :\n {} ".format(top_industry_usa))

USA top industries are :
 Banks: Commercial and Savings               8
Insurance: Property and Casualty (Stock)    7
Name: industry, dtype: int64 


- Create a series, sector_china, containing counts of the three most common values in the sector column for companies headquartered in the China.

In [103]:
series_china = f500.loc[f500["country"] == "China" , "sector"].value_counts().head()
series_china

Financials                    25
Energy                        22
Wholesalers                    9
Engineering & Construction     8
Technology                     8
Name: sector, dtype: int64

# mission 5

In [104]:
# select some rows
f500_selection = f500.loc[:, ["revenues", "revenue_change"]]
f500_selection

Unnamed: 0_level_0,revenues,revenue_change
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Walmart,485873.0,0.8
State Grid,315199.0,-4.4
Sinopec Group,267518.0,-9.1
China National Petroleum,262573.0,-12.3
Toyota Motor,254694.0,7.7
...,...,...
Teva Pharmaceutical Industries,,
New China Life Insurance,21796.0,-13.3
Wm. Morrison Supermarkets,21741.0,-11.3
TUI,21655.0,-5.5


In [105]:
f500 = pd.read_csv("f500.csv")
f500.head()

Unnamed: 0,company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
0,Walmart,1,485873,0.8,13643.0,198825,-7.2,C. Douglas McMillon,General Merchandisers,Retailing,1,USA,"Bentonville, AR",http://www.walmart.com,23,2300000,77798
1,State Grid,2,315199,-4.4,9571.3,489838,-6.2,Kou Wei,Utilities,Energy,2,China,"Beijing, China",http://www.sgcc.com.cn,17,926067,209456
2,Sinopec Group,3,267518,-9.1,1257.9,310726,-65.0,Wang Yupu,Petroleum Refining,Energy,4,China,"Beijing, China",http://www.sinopec.com,19,713288,106523
3,China National Petroleum,4,262573,-12.3,1867.5,585619,-73.7,Zhang Jianhua,Petroleum Refining,Energy,3,China,"Beijing, China",http://www.cnpc.com.cn,17,1512048,301893
4,Toyota Motor,5,254694,7.7,16899.3,437575,-12.3,Akio Toyoda,Motor Vehicles and Parts,Motor Vehicles & Parts,8,Japan,"Toyota, Japan",http://www.toyota-global.com,23,364445,157210


In [106]:
f500.loc[f500["previous_rank"] == 0, "previous_rank"] = np.nan

In [107]:
fifth_row = f500.iloc[5]
fifth_row

company                                    Volkswagen
rank                                                6
revenues                                       240264
revenue_change                                    1.5
profits                                        5937.3
assets                                         432116
profit_change                                     NaN
ceo                                   Matthias Muller
industry                     Motor Vehicles and Parts
sector                         Motor Vehicles & Parts
previous_rank                                       7
country                                       Germany
hq_location                        Wolfsburg, Germany
website                     http://www.volkswagen.com
years_on_global_500_list                           23
employees                                      626715
total_stockholder_equity                        97753
Name: 5, dtype: object

In [108]:
company = f500.iloc[:,0]

In [109]:
first_rows = f500.iloc[:3]
f500.iloc[:7]

Unnamed: 0,company,rank,revenues,revenue_change,profits,assets,profit_change,ceo,industry,sector,previous_rank,country,hq_location,website,years_on_global_500_list,employees,total_stockholder_equity
0,Walmart,1,485873,0.8,13643.0,198825,-7.2,C. Douglas McMillon,General Merchandisers,Retailing,1.0,USA,"Bentonville, AR",http://www.walmart.com,23,2300000,77798
1,State Grid,2,315199,-4.4,9571.3,489838,-6.2,Kou Wei,Utilities,Energy,2.0,China,"Beijing, China",http://www.sgcc.com.cn,17,926067,209456
2,Sinopec Group,3,267518,-9.1,1257.9,310726,-65.0,Wang Yupu,Petroleum Refining,Energy,4.0,China,"Beijing, China",http://www.sinopec.com,19,713288,106523
3,China National Petroleum,4,262573,-12.3,1867.5,585619,-73.7,Zhang Jianhua,Petroleum Refining,Energy,3.0,China,"Beijing, China",http://www.cnpc.com.cn,17,1512048,301893
4,Toyota Motor,5,254694,7.7,16899.3,437575,-12.3,Akio Toyoda,Motor Vehicles and Parts,Motor Vehicles & Parts,8.0,Japan,"Toyota, Japan",http://www.toyota-global.com,23,364445,157210
5,Volkswagen,6,240264,1.5,5937.3,432116,,Matthias Muller,Motor Vehicles and Parts,Motor Vehicles & Parts,7.0,Germany,"Wolfsburg, Germany",http://www.volkswagen.com,23,626715,97753
6,Royal Dutch Shell,7,240033,-11.8,4575.0,411275,135.9,Ben van Beurden,Petroleum Refining,Energy,5.0,Netherlands,"The Hague, Netherlands",http://www.shell.com,23,89000,186646


**try here**

In [110]:
# data cleaning basics

laptops = pd.read_csv("laptops.csv", encoding = "Latin-1")

In [111]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
Manufacturer                1303 non-null object
Model Name                  1303 non-null object
Category                    1303 non-null object
Screen Size                 1303 non-null object
Screen                      1303 non-null object
CPU                         1303 non-null object
RAM                         1303 non-null object
 Storage                    1303 non-null object
GPU                         1303 non-null object
Operating System            1303 non-null object
Operating System Version    1133 non-null object
Weight                      1303 non-null object
Price (Euros)               1303 non-null object
dtypes: object(13)
memory usage: 132.5+ KB


In [112]:
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [113]:
# clean columns names 
new_column_names = []

# string.strip() method removes the white spaces from beginning and end of a string
for name in laptops.columns:
    cleaned_name = name.strip()
    new_column_names.append(cleaned_name)
    
# clean names
laptops.columns = new_column_names
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [114]:
# string.strip() method example
print("   eslam hosam  ".strip())

eslam hosam


In [115]:
new_names = []

def clean_column(column_name):
    column_name = column_name.lower()
    column_name = column_name.strip()
    column_name = column_name.replace(" ", "_")
    column_name = column_name.replace(")", "")
    column_name = column_name.replace("(", "")
    column_name = column_name.replace("operating_system", "os")
    return column_name
    

for name in laptops.columns :
    name = clean_column(name)
    new_names.append(name)

laptops.columns = new_names
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [116]:
laptops.screen_size.unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [117]:
laptops.ram.unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)

In [118]:
print(laptops.ram.dtype)
print(laptops.ram.unique())

laptops.ram = laptops["ram"].str.replace("GB", "")
laptops.ram.astype(int)

object
['8GB' '16GB' '4GB' '2GB' '12GB' '6GB' '32GB' '24GB' '64GB']


0        8
1        8
2        8
3       16
4        8
        ..
1298     4
1299    16
1300     2
1301     6
1302     4
Name: ram, Length: 1303, dtype: int32

In [119]:
# changing screen size
print(laptops.screen_size.dtype)
print(laptops["screen_size"].unique())
laptops.screen_size = laptops.screen_size.str.replace('"' , "")
laptops["screen_size"].astype(float)

object
['13.3"' '15.6"' '15.4"' '14.0"' '12.0"' '11.6"' '17.3"' '10.1"' '13.5"'
 '12.5"' '13.0"' '18.4"' '13.9"' '12.3"' '17.0"' '15.0"' '14.1"' '11.3"']


0       13.3
1       13.3
2       15.6
3       15.4
4       13.3
        ... 
1298    14.0
1299    13.3
1300    14.0
1301    15.6
1302    15.6
Name: screen_size, Length: 1303, dtype: float64

In [120]:
# renaming a column
# Dataframe.rename()

laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [121]:
laptops.rename({"cpu" : "cpu"}, axis = "columns", inplace = True)
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [122]:
# extract values from dtring
print(laptops.gpu.head(2), "\n")

gpu_manufacturer = []

for lst in laptops.gpu.str.split():
    m = lst[0]
    if m not in gpu_manufacturer:
        gpu_manufacturer.append(m)
        
print("we have 4 companies manufacturing gpu for these laptops\n" , gpu_manufacturer)

0    Intel Iris Plus Graphics 640
1          Intel HD Graphics 6000
Name: gpu, dtype: object 

we have 4 companies manufacturing gpu for these laptops
 ['Intel', 'AMD', 'Nvidia', 'ARM']


In [123]:
# how many laptops have intel gpu and Amd and others

Intel_laptops  = []
Amd_laptops = []
others = []

for lst in laptops.gpu.str.split():
    if "Intel" in lst :
        Intel_laptops.append(lst)
        
    elif "AMD" in lst:
        Amd_laptops.append(lst)
    else:
        others.append(lst)
        
print("Intel has {} laptops ".format(len(Intel_laptops)))
print("AMD has {} laptops".format(len(Amd_laptops)))
print("others has {}".format(len(others)))

Intel has 722 laptops 
AMD has 180 laptops
others has 401


In [124]:
# create a new column for the gpu_manufacturer

v1 = laptops.gpu.str.split().str[0]
v1

0       Intel
1       Intel
2       Intel
3         AMD
4       Intel
        ...  
1298    Intel
1299    Intel
1300    Intel
1301      AMD
1302    Intel
Name: gpu, Length: 1303, dtype: object

In [125]:
laptops.gpu.str.split().str[1]

0         Iris
1           HD
2           HD
3       Radeon
4         Iris
         ...  
1298        HD
1299        HD
1300        HD
1301    Radeon
1302        HD
Name: gpu, Length: 1303, dtype: object

In [126]:
laptops.gpu.str.split().str[3]

0       Graphics
1           6000
2            620
3            455
4       Graphics
          ...   
1298         520
1299         520
1300         NaN
1301        M330
1302         NaN
Name: gpu, Length: 1303, dtype: object

In [127]:
laptops.gpu.str.split().str[0].isin (["Intel"]).value_counts()

True     722
False    581
Name: gpu, dtype: int64

In [128]:
# create a new column for cpu manufacturers
cpu_manu = laptops["cpu"].str.split().str[0]

laptops["cpu_manuf"] = cpu_manu
laptops.cpu_manuf.value_counts()

Intel      1240
AMD          62
Samsung       1
Name: cpu_manuf, dtype: int64

In [129]:
# create a gpu manffacturer column
gpu_manufacturer = laptops["gpu"].str.split().str[0]

laptops["gpu_manuf"] = gpu_manufacturer
laptops.gpu_manuf.value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu_manuf, dtype: int64

In [130]:
laptops.os.value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [131]:
s = pd.Series(["paisr", "oranj", "anana", "oranje"])
print(s)

0     paisr
1     oranj
2     anana
3    oranje
dtype: object


In [132]:
dic = {"paisr" : "bears" , 
       "oranje" : "orange", 
       "anana" : "banana",
       }


s = s.map(dic)

In [133]:
laptops.os.value_counts(dropna = False)

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [134]:
os_dic = {
    "No OS" : "No OS",
    "Linux" : "Linux",
    "Chrome OS" : "Chrome OS",
    "Android" : "Android", 
    "Mac OS" : "Mac OS",
    "macos" : "Mac OS"
}

laptops.os = laptops.os.map(os_dic)
laptops.os.value_counts(dropna = False)

NaN          1138
No OS          66
Linux          62
Chrome OS      27
Mac OS          8
Android         2
Name: os, dtype: int64

In [135]:
os_dic = {
    np.nan : "Windows",
    "No OS" : "No OS",
    "Linux" : "Linux",
    "Chrome OS" : "Chrome OS",
    "Android" : "Android", 
    "Mac OS" : "Mac OS",
    "macos" : "Mac OS"
}

laptops.os = laptops.os.map(os_dic)
laptops.os.value_counts(dropna = False)

Windows      1138
No OS          66
Linux          62
Chrome OS      27
Mac OS          8
Android         2
Name: os, dtype: int64

In [136]:
laptops.isnull().sum(axis = "index")

manufacturer      0
model_name        0
category          0
screen_size       0
screen            0
cpu               0
ram               0
storage           0
gpu               0
os                0
os_version      170
weight            0
price_euros       0
cpu_manuf         0
gpu_manuf         0
dtype: int64

In [137]:
laptops.os_version.value_counts(dropna = False)

10      1072
NaN      170
7         45
10 S       8
X          8
Name: os_version, dtype: int64

In [138]:
# how to fill the missing values in pandas

laptops.loc[laptops.os_version.isnull() , "os"].value_counts(dropna = False)

No OS        66
Linux        62
Chrome OS    27
Windows      13
Android       2
Name: os, dtype: int64

In [139]:
laptops.os_version.value_counts(dropna = False)

10      1072
NaN      170
7         45
10 S       8
X          8
Name: os_version, dtype: int64

In [140]:
laptops.loc[(laptops["os_version"].isnull() & (laptops["os"] == "Windows")),"os_version"] = "win 10"

laptops.loc[laptops.os_version == "win 10" , "os"].value_counts()

Windows    13
Name: os, dtype: int64

In [154]:
# clean string column

print(laptops.weight.dtype)

laptops.weight = laptops["weight"].str.split("kg").str[0].astype(float)
laptops.weight.dtype

object


dtype('float64')

In [156]:
laptops = laptops.rename(columns = {"weight" : "weight_kg"})
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight_kg',
       'price_euros', 'cpu_manuf', 'gpu_manuf'],
      dtype='object')

In [167]:
laptops.price_euros



0       1339.69
1        898.94
2        575.00
3       2537.45
4       1803.60
         ...   
1298     638.00
1299    1499.00
1300     229.00
1301     764.00
1302     369.00
Name: price_euros, Length: 1303, dtype: float64

In [179]:
# how many core i7 laptops in the data set

(laptops.cpu.str.split().str[2] == "i5").value_counts()

False    880
True     423
Name: cpu, dtype: int64