### DROPPING ROWS AND COLUMNS

+ The .drop() method drops rows and columns from dataframe.
+ `axis = 0 ==> rows` and `axis = 1 ==> columns`

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("transactions.csv")
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


In [4]:
df.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [9]:
df.drop([1,2,3], axis = 0)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
4,2013-01-02,4,1922
5,2013-01-02,5,1903
6,2013-01-02,6,2143
7,2013-01-02,7,1874
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [10]:
df.drop("date", axis = 1)

Unnamed: 0,store_nbr,transactions
0,25,770
1,1,2111
2,2,2358
3,3,3487
4,4,1922
...,...,...
83483,50,2804
83484,51,1573
83485,52,2255
83486,53,932


### IDENTIFYING DUPLICATE ROWS

+ The .duplicated() method identifies duplicates rows of data.
+ The .drop_duplicates() method will drop the duplicates rows from the dataframe.
+ Specifying the subset as column will drop the duplicates from the column.


In [12]:
data = pd.read_csv("product.csv")
data.duplicated().sum()

0

In [13]:
data.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [19]:
## to find out if there any duplicate rows in a particulr column.
data.duplicated(subset = "DEPARTMENT")

0        False
1        False
2        False
3         True
4         True
         ...  
92348     True
92349     True
92350     True
92351     True
92352     True
Length: 92353, dtype: bool

In [32]:
## in the dataframe there us no duplicates.
print("The duplicates in the Dataframe :--",data.duplicated().sum())
## check the duplicates in the department column/
print("The Duplicates in the Department Column :--", data.duplicated(subset = "DEPARTMENT"))
print("The Duplicates in the Department Column in numbers  :--", data.duplicated(subset = "DEPARTMENT").sum())

print("The Shape of the Data :-", data.shape)
## drop the duplicates.
data.drop_duplicates(subset = "DEPARTMENT", keep = "last", ignore_index=True)
data.shape

The duplicates in the Dataframe :-- 0
The Duplicates in the Department Column :-- 0        False
1        False
2        False
3         True
4         True
         ...  
92348     True
92349     True
92350     True
92351     True
92352     True
Length: 92353, dtype: bool
The Duplicates in the Department Column in numbers  :-- 92309
The Shape of the Data :- (92353, 7)


(92353, 7)

In [49]:
oil_data = pd.read_csv("oil.csv")
print(oil_data.shape)

## append the last row into the dataframe.
oil_data = oil_data.append(oil_data.iloc[-1]).reset_index(drop = True)
print(oil_data.tail())

## check for the duplicates
print(oil_data.duplicated())

## drop the duplicates
print(oil_data.drop_duplicates())

## print the column where dcoilwtico whether it is duplicated.
oil_data.duplicated(subset = "dcoilwtico").sum()
oil_data.drop_duplicates(subset = "dcoilwtico", keep = "last", ignore_index = True)

(1218, 2)
            date  dcoilwtico
1214  2017-08-28       46.40
1215  2017-08-29       46.46
1216  2017-08-30       45.96
1217  2017-08-31       47.26
1218  2017-08-31       47.26
0       False
1       False
2       False
3       False
4       False
        ...  
1214    False
1215    False
1216    False
1217    False
1218     True
Length: 1219, dtype: bool
            date  dcoilwtico
0     2013-01-01         NaN
1     2013-01-02       93.14
2     2013-01-03       92.97
3     2013-01-04       93.12
4     2013-01-07       93.20
...          ...         ...
1213  2017-08-25       47.65
1214  2017-08-28       46.40
1215  2017-08-29       46.46
1216  2017-08-30       45.96
1217  2017-08-31       47.26

[1218 rows x 2 columns]


Unnamed: 0,date,dcoilwtico
0,2013-01-02,93.14
1,2013-01-03,92.97
2,2013-01-07,93.20
3,2013-01-09,93.08
4,2013-01-14,94.27
...,...,...
994,2017-08-25,47.65
995,2017-08-28,46.40
996,2017-08-29,46.46
997,2017-08-30,45.96


ASSIGNMENTS

In [62]:
transactions = pd.read_csv("transactions.csv")
transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [63]:
transactions.drop(0, axis = 0, inplace = True)

In [65]:
transactions.head()

Unnamed: 0,date,store_nbr,transactions
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
5,2013-01-02,5,1903


In [68]:
## drop the first column.
transactions.drop("date", axis = 1, inplace = True)
transactions.head()

Unnamed: 0,store_nbr,transactions
1,1,2111
2,2,2358
3,3,3487
4,4,1922
5,5,1903


In [70]:
## keep the last ites in a transaction
transactions.drop_duplicates(subset = "store_nbr", keep = "last")


Unnamed: 0,store_nbr,transactions
83434,1,1693
83435,2,1737
83436,3,2956
83437,4,1283
83438,5,1310
83439,6,1589
83440,7,1780
83441,8,2621
83442,9,2155
83443,10,1010
