# Common Data Analysis Methods
* Notes
* Other commonly used methods include:

1. df.sum() - Sum of values
2. df.cumsum() - Cumulative sum of values
3. df.min()/df.max() - Minimum/maximum values
4. df.idxmin()/df.idxmax() - Index labels of the minimum/maximum values
5. df.mean() - Mean of values
6. df.median() - Median of values
7. df.mode() - Mode of the values
8. series.value_counts() - Counts of unique values for a Series (aka a column)


---


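* `value_counts()` is typically used for counting unique occurrences within a single column, i.e. on a Series.
* Select a column first (e.g. `df['col'].value_counts()`); called on an entire DataFrame it counts unique rows rather than the values of one column.
* The other methods above can be applied directly to a DataFrame or a Series, depending on the method.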

In [None]:
import pandas as pd

In [None]:
# Read the jobs CSV into the DataFrame used by every cell below.
DATA_PATH = "/content/data_jobs.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Parse job_posted_date into pandas datetimes and store it back on the frame.
posted_at = pd.to_datetime(df['job_posted_date'])
df['job_posted_date'] = posted_at

In [None]:
# Summary statistics for the columns describe() picks up by default.
summary_stats = df.describe()
summary_stats

Unnamed: 0,job_posted_date,salary_year_avg,salary_hour_avg
count,6866,238.0,100.0
mean,2023-06-23 06:26:18.448441344,130473.224708,54.3279
min,2023-01-01 13:02:08,34288.574219,9.5
25%,2023-03-17 14:05:57.750000128,96945.375,38.410001
50%,2023-06-21 13:50:29,125000.0,56.545
75%,2023-09-25 13:54:20.750000128,157500.0,66.375
max,2023-12-31 13:32:38,375000.0,115.0
std,,51989.85652,21.027555


In [None]:
# Print each column's name, non-null count, and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6866 entries, 0 to 6865
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   job_title_short        6866 non-null   object        
 1   job_title              6866 non-null   object        
 2   job_location           6851 non-null   object        
 3   job_via                6865 non-null   object        
 4   job_schedule_type      6777 non-null   object        
 5   job_work_from_home     6866 non-null   bool          
 6   search_location        6866 non-null   object        
 7   job_posted_date        6866 non-null   datetime64[ns]
 8   job_no_degree_mention  6866 non-null   bool          
 9   job_health_insurance   6866 non-null   bool          
 10  job_country            6865 non-null   object        
 11  salary_rate            339 non-null    object        
 12  salary_year_avg        238 non-null    float64       
 13  sal

In [None]:
# Median of the yearly salary column.
yearly_salary = df['salary_year_avg']
yearly_salary.median()

125000.0

In [None]:
# Smallest value in the yearly salary column.
df.loc[:, 'salary_year_avg'].min()

34288.57421875

In [None]:
# idxmin() returns the index label of the row holding the lowest salary_year_avg.
# Despite its name, min_salary stores that row label, not the salary value itself.
yearly_salaries = df['salary_year_avg']
min_salary = yearly_salaries.idxmin()

In [None]:
# Show the full row for the minimum salary.
# min_salary comes from idxmin(), which returns an index *label*, so the row is
# looked up with .loc (label-based). .iloc is position-based and only happens to
# match on a default RangeIndex; it would pick the wrong row (or raise) once the
# frame is filtered, sorted, or re-indexed.
[df.loc[min_salary]]

[job_title_short                                               Data Analyst
 job_title                                           Citrus Data Analyst II
 job_location                                                   Alachua, FL
 job_via                                                         via Indeed
 job_schedule_type                                                Full-time
 job_work_from_home                                                   False
 search_location                                     Florida, United States
 job_posted_date                                        2023-04-27 13:02:12
 job_no_degree_mention                                                 True
 job_health_insurance                                                  True
 job_country                                                  United States
 salary_rate                                                           year
 salary_year_avg                                               34288.574219
 salary_hour

In [None]:
# Distinct values found in the job_title_short column.
title_column = df['job_title_short']
title_column.unique()

array(['Senior Data Engineer', 'Data Analyst', 'Data Engineer',
       'Business Analyst', 'Data Scientist', 'Machine Learning Engineer',
       'Senior Data Analyst', 'Cloud Engineer', 'Senior Data Scientist',
       'Software Engineer'], dtype=object)

In [None]:
# Number of rows per job_title_short value; displayed as a one-element list.
title_counts = df['job_title_short'].value_counts()
[title_counts]

[job_title_short
 Data Engineer                2080
 Data Analyst                 1508
 Data Scientist               1401
 Senior Data Engineer          504
 Senior Data Scientist         335
 Business Analyst              312
 Software Engineer             312
 Senior Data Analyst           216
 Machine Learning Engineer     119
 Cloud Engineer                 79
 Name: count, dtype: int64]

# groupby

In [None]:
# Minimum yearly salary within each job_title_short group.
salary_by_title = df.groupby('job_title_short')['salary_year_avg']
[salary_by_title.min()]

[job_title_short
 Business Analyst              43200.000000
 Cloud Engineer                         NaN
 Data Analyst                  34288.574219
 Data Engineer                 70000.000000
 Data Scientist                50400.000000
 Machine Learning Engineer     56700.000000
 Senior Data Analyst           84039.000000
 Senior Data Engineer          45000.000000
 Senior Data Scientist        100000.000000
 Software Engineer             56700.000000
 Name: salary_year_avg, dtype: float64]

In [None]:
# Median yearly salary for every (job_title_short, job_country) pair.
title_country_groups = df.groupby(['job_title_short', 'job_country'])
[title_country_groups['salary_year_avg'].median()]

[job_title_short    job_country         
 Business Analyst   Australia                    NaN
                    Austria                      NaN
                    Bangladesh                   NaN
                    Belgium                      NaN
                    Brazil                       NaN
                                              ...   
 Software Engineer  Ukraine                      NaN
                    United Arab Emirates         NaN
                    United Kingdom               NaN
                    United States           275250.0
                    Uruguay                      NaN
 Name: salary_year_avg, Length: 599, dtype: float64]

In [None]:
# Minimum of both salary columns within each job_title_short group.
salary_columns = ['salary_year_avg', 'salary_hour_avg']
[df.groupby('job_title_short')[salary_columns].min()]

[                           salary_year_avg  salary_hour_avg
 job_title_short                                            
 Business Analyst              43200.000000        32.184998
 Cloud Engineer                         NaN              NaN
 Data Analyst                  34288.574219        15.000000
 Data Engineer                 70000.000000        32.500000
 Data Scientist                50400.000000         9.500000
 Machine Learning Engineer     56700.000000              NaN
 Senior Data Analyst           84039.000000        24.969999
 Senior Data Engineer          45000.000000        52.500000
 Senior Data Scientist        100000.000000        17.500000
 Software Engineer             56700.000000        23.450001]

In [None]:
# Use .agg() to apply several different aggregations in a single groupby call

In [None]:
# Min, max, and mean yearly salary for each job_title_short group, computed in one pass.
salary_stats = ['min', 'max', 'mean']
[df.groupby('job_title_short')['salary_year_avg'].agg(salary_stats)]

[                                     min       max           mean
 job_title_short                                                  
 Business Analyst            43200.000000  130000.0   85630.000000
 Cloud Engineer                       NaN       NaN            NaN
 Data Analyst                34288.574219  186500.0   92410.916427
 Data Engineer               70000.000000  300000.0  134139.815136
 Data Scientist              50400.000000  361000.0  143706.336735
 Machine Learning Engineer   56700.000000  325000.0  150340.000000
 Senior Data Analyst         84039.000000  170000.0  111089.833333
 Senior Data Engineer        45000.000000  375000.0  149453.793103
 Senior Data Scientist      100000.000000  187500.0  148472.750000
 Software Engineer           56700.000000  317000.0  169620.000000]