# 数据分析入门

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display  # display 函数可以更灵活地显示各种对象，比如 DataFrame、图片等，相比直接打印能提供更美观的输出格式。
pd.options.display.max_columns = 50

## 规划数据分析路线

In [2]:
college = pd.read_csv('college.csv')
college.head()

Unnamed: 0,instnm,city,stabbr,hbcu,menonly,womenonly,relaffil,satvrmid,satmtmid,distanceonly,ugds,ugds_white,ugds_black,ugds_hisp,ugds_asian,ugds_aian,ugds_nhpi,ugds_2mor,ugds_nra,ugds_unkn,pptug_ef,curroper,pctpell,pctfloan,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [3]:
# 数据的行数与列数
college.shape

(7535, 27)

In [4]:
# 统计数值列，并进行转置
with pd.option_context('display.max_rows', 8):  # 最多显示 8 行数据（该设置只在 with 代码块内生效，退出后自动恢复原设置）
    display(college.describe(include=[np.number]).T)  # display() 函数输出美化后的表格格式

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.0000,0.00000,0.000000,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.0000,0.00000,0.000000,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.0000,0.00000,0.000000,1.0
relaffil,7535.0,0.190975,0.393096,0.0,0.0000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...
curroper,7535.0,0.923291,0.266146,0.0,1.0000,1.00000,1.000000,1.0
pctpell,6849.0,0.530643,0.225544,0.0,0.3578,0.52150,0.712900,1.0
pctfloan,6849.0,0.522211,0.283616,0.0,0.3329,0.58330,0.745000,1.0
ug25abv,6718.0,0.410021,0.228939,0.0,0.2415,0.40075,0.572275,1.0


In [5]:
# 未设置最大行数
college.describe(include=[np.number]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
relaffil,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
satvrmid,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
satmtmid,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
distanceonly,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
ugds,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
ugds_white,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
ugds_black,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [6]:
# 统计对象和类型列
'''
count 非空值数量
unique 唯一值数量
top 最常出现的值
freq 最高频值的出现次数
'''
college.describe(include=[object, pd.Categorical]).T  # pd.Categorical：Pandas 的分类数据类型


Unnamed: 0,count,unique,top,freq
instnm,7535,7535,Excel Learning Center-San Antonio South,1
city,7535,2514,New York,87
stabbr,7535,59,CA,773
md_earn_wne_p10,6413,598,PrivacySuppressed,822
grad_debt_mdn_supp,7503,2038,PrivacySuppressed,1510


In [7]:
# 列出每列的数据类型，非缺失值的数量，以及内存的使用
'''
列名 (Column)
非空值数量 (Non-Null Count)
数据类型 (Dtype)
'''
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   instnm              7535 non-null   object 
 1   city                7535 non-null   object 
 2   stabbr              7535 non-null   object 
 3   hbcu                7164 non-null   float64
 4   menonly             7164 non-null   float64
 5   womenonly           7164 non-null   float64
 6   relaffil            7535 non-null   int64  
 7   satvrmid            1185 non-null   float64
 8   satmtmid            1196 non-null   float64
 9   distanceonly        7164 non-null   float64
 10  ugds                6874 non-null   float64
 11  ugds_white          6874 non-null   float64
 12  ugds_black          6874 non-null   float64
 13  ugds_hisp           6874 non-null   float64
 14  ugds_asian          6874 non-null   float64
 15  ugds_aian           6874 non-null   float64
 16  ugds_n

In [8]:
# 在 describe 中打印分位数
with pd.option_context('display.max_rows', 6):
    display(college.describe(include=[np.number], percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99]).T)

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.000000,0.0000,0.00000,0.0000,0.00000,0.000000,0.00000,0.00000,1.000000,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.000000,0.0000,0.00000,0.0000,0.00000,0.000000,0.00000,0.00000,0.000000,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.000000,0.0000,0.00000,0.0000,0.00000,0.000000,0.00000,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pctpell,6849.0,0.530643,0.225544,0.0,0.054976,0.1700,0.23656,0.3578,0.52150,0.712900,0.83330,0.89636,0.993908,1.0
pctfloan,6849.0,0.522211,0.283616,0.0,0.000000,0.0000,0.00000,0.3329,0.58330,0.745000,0.84752,0.89792,0.986368,1.0
ug25abv,6718.0,0.410021,0.228939,0.0,0.002500,0.0374,0.08990,0.2415,0.40075,0.572275,0.72666,0.80000,0.917383,1.0


In [9]:
# 导入一个数据字典.csv：数据字典的主要作用是解释列名的意义
college_dd =pd.read_csv('college_data_dictionary.csv')
with pd.option_context('display.max_rows', 8):
    display(college_dd)

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or University
...,...,...
23,PCTFLOAN,Percent Students with federal loan
24,UG25ABV,Percent Students Older than 25
25,MD_EARN_WNE_P10,Median Earnings 10 years after enrollment
26,GRAD_DEBT_MDN_SUPP,Median debt of completers


In [10]:
college.columns

Index(['instnm', 'city', 'stabbr', 'hbcu', 'menonly', 'womenonly', 'relaffil',
       'satvrmid', 'satmtmid', 'distanceonly', 'ugds', 'ugds_white',
       'ugds_black', 'ugds_hisp', 'ugds_asian', 'ugds_aian', 'ugds_nhpi',
       'ugds_2mor', 'ugds_nra', 'ugds_unkn', 'pptug_ef', 'curroper', 'pctpell',
       'pctfloan', 'ug25abv', 'md_earn_wne_p10', 'grad_debt_mdn_supp'],
      dtype='object')

## 改变数据类型， 降低内存消耗

In [11]:
# 随机选取5列
college = pd.read_csv('college.csv')

different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
different_cols = [col.lower() for col in different_cols]
col2 = college.loc[:, different_cols]
col2.head()

Unnamed: 0,relaffil,satmtmid,curroper,instnm,stabbr
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [12]:
# 查看数据类型
col2.dtypes

relaffil      int64
satmtmid    float64
curroper      int64
instnm       object
stabbr       object
dtype: object

In [13]:
# ⽤memory_usage方法查看每列的内存消耗 
original_mem = col2.memory_usage(deep=True)
'''
deep=True 参数
    - 启用深度计算模式，会实际检查对象内容（如字符串的真实长度）
    - 不加该参数只会计算表面内存占用（不推荐）
'''
original_mem

Index          132
relaffil     60280
satmtmid     60280
curroper     60280
instnm      599848
stabbr      384285
dtype: int64

In [14]:
# 使用 astype 方法转换数据类型
col2['relaffil'] = col2['relaffil'].astype(np.int8)

col2.dtypes

relaffil       int8
satmtmid    float64
curroper      int64
instnm       object
stabbr       object
dtype: object

In [15]:
# 检查两个对象列的独立值的个数
'''
nunique()
计算每列中不同唯一值的数量
自动忽略缺失值（NaN 不参与计数）
'''
col2.select_dtypes(include=[object]).nunique()


instnm    7535
stabbr      59
dtype: int64

In [16]:
# stabbr 列可以转变为‘类型’（Categorical），独立值的个数小于总数的1%
col2['stabbr'] = col2['stabbr'].astype('category')  # category：分类类型
'''
分类类型（category）的特点
1. 内存优化
    - 存储的不是原始值，而是基于整数编码的类别索引
    - 特别适合重复值多的列（如性别、省份、等级等）
2. 性能提升
    - 对分类列的操作（如 groupby、排序）比字符串类型更快
    - 统计运算时会自动忽略未使用的类别
3. 有序性支持
    - 可以定义类别顺序（如：小<中<大）
'''
col2.dtypes

relaffil        int8
satmtmid     float64
curroper       int64
instnm        object
stabbr      category
dtype: object

In [17]:
new_mem = col2.memory_usage(deep=True)
new_mem

Index          132
relaffil      7535
satmtmid     60280
curroper     60280
instnm      600307
stabbr       12648
dtype: int64

In [18]:
new_mem / original_mem

Index       1.000000
relaffil    0.125000
satmtmid    1.000000
curroper    1.000000
instnm      1.000765
stabbr      0.032913
dtype: float64

In [19]:
college[['curroper', 'instnm']].memory_usage(deep=True)

Index          132
curroper     60280
instnm      600307
dtype: int64

In [20]:
college.loc[0, 'curroper'], college.loc[0, 'instnm']

(np.int64(1), 'Alabama A & M University')

In [21]:
college.loc[0, 'curroper'] = 10000000  # curroper 列加上了 10000000，但内存使用没有变化
'''
- 数值列（如 int64/float64）在创建时就会预分配固定大小的内存空间
- 每个数值占用固定字节数（如 int64 固定8字节），修改值不会改变内存占用
- 类似数组的连续存储结构，修改元素值不影响整体内存布局
'''
college.loc[0, 'instnm'] = college.loc[0, 'instnm'] + 'a'  # instnm 列加上了一个a，内存消耗增加1字节
# college.loc[1, 'instnm'] = college.loc[1, 'instnm'] + 'a'
'''
- 字符串列以 object 类型存储，实际保存的是 Python 对象的指针
- 每次修改都会创建新字符串对象（Python 字符串不可变）
- 内存按实际字符串长度动态分配，增加1个字符就多占1字节
- Pandas 不会为字符串预分配缓冲区，每个元素独立分配内存
'''

college[['curroper', 'instnm']].memory_usage(deep=True)

Index          132
curroper     60280
instnm      600308
dtype: int64

In [22]:
college['MENONLY'.lower()].dtype

dtype('float64')

In [23]:
college['MENONLY'.lower()]  # 数据字典中的信息显示 menonly 这列只包含 0 和 1，但是由于含有缺失值，它的类型是浮点型

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
7530    NaN
7531    NaN
7532    NaN
7533    NaN
7534    NaN
Name: menonly, Length: 7535, dtype: float64

In [24]:
college['MENONLY'.lower()].astype('int8')  # 有缺失值转化为整型会报错


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [25]:
college.describe(include=['int64', 'float64']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
relaffil,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
satvrmid,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
satmtmid,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
distanceonly,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
ugds,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
ugds_white,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
ugds_black,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [26]:
college.describe(include=[np.int64, np.float64]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
relaffil,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
satvrmid,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
satmtmid,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
distanceonly,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
ugds,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
ugds_white,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
ugds_black,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [27]:
college['RELAFFIL'.lower()] = college['RELAFFIL'.lower()].astype(np.int8)
college.describe(include=['int', 'float']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
satvrmid,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
satmtmid,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
distanceonly,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
ugds,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
ugds_white,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
ugds_black,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0
ugds_hisp,6874.0,0.161635,0.221854,0.0,0.0276,0.0714,0.198875,1.0


In [28]:
college.describe(include=['number']).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hbcu,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
menonly,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
womenonly,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
relaffil,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
satvrmid,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
satmtmid,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
distanceonly,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
ugds,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
ugds_white,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
ugds_black,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [29]:
# 转变数据类型时也可以如法炮制
college['MENONLY'.lower()] = college['MENONLY'.lower()].astype('float16')
college['RELAFFIL'.lower()] = college['RELAFFIL'.lower()].astype('int8')
college.index = pd.Index(college.index, dtype='int64')
college.index.memory_usage()

132

## 从最大中选择最小

In [30]:
import pandas as pd
import numpy as np

In [31]:
movie = pd.read_csv('movie.csv')
movie2 = movie[['title', 'imdb_score', 'budget']]
movie2.head()

Unnamed: 0,title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [32]:
# 用 nlargest 方法选出 imdb_score 最大的100个数
movie2.nlargest(100, 'imdb_score').head()

Unnamed: 0,title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [33]:
# 用链式操作，nsmallest 方法在从中挑选 budget 最小的五个数
movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget')


Unnamed: 0,title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


## 通过排序选取每组最大值

In [34]:
movie3 = movie[['title', 'year', 'imdb_score']]
movie3.sort_values('year', ascending=False).head()

Unnamed: 0,title,year,imdb_score
2211,Nerve,2016.0,7.1
2083,"Hail, Caesar!",2016.0,6.4
73,Suicide Squad,2016.0,6.9
4355,The Dog Lover,2016.0,4.8
2077,Our Kind of Traitor,2016.0,6.4


In [35]:
# 用列表同时对两列进行排序
movie4 = movie3.sort_values(by=['year', 'imdb_score'], ascending=False)
movie4.head()

Unnamed: 0,title,year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
4277,A Beginner's Guide to Snuff,2016.0,8.7
3798,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2


In [36]:
# 用drop_duplicates去重，只保留每年的第一条数据
movie_top_year = movie3.drop_duplicates(subset=['year'])
movie_top_year.head()

Unnamed: 0,title,year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates of the Caribbean: At World's End,2007.0,7.1
2,Spectre,2015.0,6.8
3,The Dark Knight Rises,2012.0,8.5
4,Star Wars: Episode VII - The Force Awakens,,7.1


In [37]:
# 通过给 ascending 设置列表，可以同时对一列降序排列，一列升序排列
movie5 = movie[['title', 'year', 'content_rating', 'budget']]
movie5_sorted = movie5.sort_values(by=['year', 'content_rating', 'budget'], ascending=[False, False, True])
movie5_sorted.drop_duplicates(subset=['year', 'content_rating']).head(10)  # 对排序后的数据按年份和内容分级组合去重

Unnamed: 0,title,year,content_rating,budget
4026,Compadres,2016.0,R,3000000.0
4658,Fight to the Finish,2016.0,PG-13,150000.0
4661,Rodeo Girl,2016.0,PG,500000.0
3252,The Wailing,2016.0,Not Rated,
4659,Alleluia! The Devil's Carnival,2016.0,,500000.0
4731,Bizarre,2015.0,Unrated,500000.0
812,The Ridiculous 6,2015.0,TV-14,
4831,The Gallows,2015.0,R,100000.0
4825,Romantic Schemer,2015.0,PG-13,125000.0
3796,R.L. Stine's Monsterville: The Cabinet of Souls,2015.0,PG,4400000.0


## 用 sort_values 复现 nlargest 方法

In [38]:
movie_smalllest_largest = movie2.nlargest(100, 'imdb_score').nsmallest(5, 'budget')
movie_smalllest_largest

Unnamed: 0,title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [39]:
# 用 sort_values 方法，选取 imdb_score 最高的100个
movie2.sort_values('imdb_score', ascending=False).head(100).head()
# 第一个.head(100)是真正限制结果数量
# 第二个.head()默认取前5条，只是为了快速查看样本
# 这种写法虽然能达到目的，但更简洁的写法应该是直接.head(5)，除非确实需要中间100条的结果。


Unnamed: 0,title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
4312,Kickboxer: Vengeance,9.1,17000000.0
2779,Dekalog,9.1,


In [40]:
movie2.sort_values('imdb_score', ascending=False).head(100).sort_values('budget').head()

Unnamed: 0,title,imdb_score,budget
4815,A Charlie Brown Christmas,8.4,150000.0
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0


In [41]:
# tail 可以查看尾部
movie2.nlargest(100, 'imdb_score').tail()

Unnamed: 0,title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill a Mockingbird,8.4,2000000.0
4395,Reservoir Dogs,8.4,1200000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


In [42]:
movie2.sort_values(by='imdb_score', ascending=False).head(100).tail()

Unnamed: 0,title,imdb_score,budget
2646,U2 3D,8.4,
4815,A Charlie Brown Christmas,8.4,150000.0
3902,M*A*S*H,8.4,
2922,Das Boot,8.4,14000000.0
2605,Lawrence of Arabia,8.4,15000000.0
