In [1]:
#导入package
import pandas as pd
import numpy as np

In [5]:
#读取数据
data = pd.read_csv("poj_question_text.csv")

In [6]:
data.head()

Unnamed: 0,q_index,q_text
0,1000,Calculate a+b \n
1,1001,Problems involving the computation of exact va...
2,1002,Businesses like to have memorable telephone nu...
3,1003,How far can you make a stack of cards overhang...
4,1004,Larry graduated this year and finally has a jo...


# 数据清洗

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3034 entries, 0 to 3033
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   q_index  3034 non-null   int64 
 1   q_text   2998 non-null   object
dtypes: int64(1), object(1)
memory usage: 47.5+ KB


## 去除空值

In [16]:
data = data.dropna(how="any")
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2998 entries, 0 to 3033
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   q_index  2998 non-null   int64 
 1   q_text   2998 non-null   object
dtypes: int64(1), object(1)
memory usage: 70.3+ KB


## 中文/乱码处理

In [19]:
def check_cn(check_str):
      for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
      return False

In [38]:
#取出带有中文的数据（这里的中文大部分都是乱码）
data_zh = data[data['q_text'].map(lambda x:check_cn(x))]

In [39]:
data_zh.info() #可以看到中文的有50个

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 61 to 2986
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   q_index  50 non-null     int64 
 1   q_text   50 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.2+ KB


In [54]:
#选出英文数据
data_en = data[data['q_text'].map(lambda x:not check_cn(x))]
data_en.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2948 entries, 0 to 3033
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   q_index  2948 non-null   int64 
 1   q_text   2948 non-null   object
dtypes: int64(1), object(1)
memory usage: 69.1+ KB


## 观察文字长度信息

In [88]:
#去掉英文数据中的回车符号
data_en['q_text'] = data_en['q_text'].apply(lambda x:x.strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_en['q_text'] = data_en['q_text'].apply(lambda x:x.strip())


In [90]:
word_length = data_en['q_text'].map(lambda x:len(x.split()))

In [91]:
count_data = word_length>510

In [92]:
count_data.value_counts() # 大于510的有116个，比较少，选择去除这些数据

False    2832
True      116
Name: q_text, dtype: int64

In [96]:
#去除大于510的文本数据
data_en = data_en[data_en['q_text'].map(lambda x:len(x.split())<510)]

In [97]:
data_en.info() # 现在就剩下2832个数据

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2832 entries, 0 to 3033
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   q_index  2832 non-null   int64 
 1   q_text   2832 non-null   object
dtypes: int64(1), object(1)
memory usage: 66.4+ KB


In [98]:
data_en

Unnamed: 0,q_index,q_text
0,1000,Calculate a+b
1,1001,Problems involving the computation of exact va...
2,1002,Businesses like to have memorable telephone nu...
3,1003,How far can you make a stack of cards overhang...
4,1004,Larry graduated this year and finally has a jo...
...,...,...
3002,4022,Problems
3010,4031,There is a robot who lives on a cartesian plan...
3013,4034,Problems
3023,4044,Problems


In [100]:
data_en.to_csv("post_poj_text.csv",index=False)

# 处理JUNYI数据

In [61]:
junyi = pd.read_csv("junyi_question_text.txt",sep='#') #由于文本中有颜色属性也是用#号表示的

In [62]:
junyi = junyi.loc[:,['question_name','chinese_question']]

In [63]:
junyi['q_index'] = [i for i in range(1,841)] #加入索引列，题目序号从1开始

In [64]:
junyi.head()

Unnamed: 0,question_name,chinese_question,q_index
0,parabola_intuition_1,Adjust the leadership factor and the coefficie...,1
1,circles_and_arcs,TIMEOUT_ISSUE,2
2,inscribed_angles_3,"If the orange angle is 70 degrees, then I ask ...",3
3,solving_quadratics_by_factoring,Excuse me f (x) f (x) and x Where x-axis inter...,4
4,graphing_parabolas_1,Draw the following equation:,5


In [65]:
junyi.info() #可以看到是有空值存在的，所以要去掉空值

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   question_name     840 non-null    object
 1   chinese_question  831 non-null    object
 2   q_index           840 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 19.8+ KB


## 去除TIMEOUT_ISSUE和NAN数据

In [66]:
#去除空值
junyi.dropna(how="any",inplace=True)

In [67]:
junyi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 831 entries, 0 to 839
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   question_name     831 non-null    object
 1   chinese_question  831 non-null    object
 2   q_index           831 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 26.0+ KB


In [68]:
# 去除 TIMEOUT_ISSUE and LINK_ISSUE
junyi = junyi[junyi['chinese_question'].map(lambda x:'TIMEOUT_ISSUE' not in x)]
junyi = junyi[junyi['chinese_question'].map(lambda x:'LINK_ISSUE' not in x)]

In [69]:
junyi

Unnamed: 0,question_name,chinese_question,q_index
0,parabola_intuition_1,Adjust the leadership factor and the coefficie...,1
2,inscribed_angles_3,"If the orange angle is 70 degrees, then I ask ...",3
3,solving_quadratics_by_factoring,Excuse me f (x) f (x) and x Where x-axis inter...,4
4,graphing_parabolas_1,Draw the following equation:,5
9,angles_2,Given the following conditions:,10
...,...,...,...
826,vertex_of_a_parabola,There is a following equation:,827
827,logarithms_1,\log_{3}243 = {?} log 3 243=?,828
829,proportions_inner_outer_product,A known (X-1) :( x + 8) = 8: 6 (X-1) :( x + 8)...,830
835,matrix_basic_distance,Highway mileage data traffic between several c...,836


In [70]:
junyi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325 entries, 0 to 836
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   question_name     325 non-null    object
 1   chinese_question  325 non-null    object
 2   q_index           325 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 10.2+ KB


## 去除回车符号

In [71]:
junyi['chinese_question'] = junyi['chinese_question'].apply(lambda x:x.strip())

## 观察长度信息

In [72]:
length_info = junyi['chinese_question'].map(lambda x:len(x.split(" ")) > 510)

In [73]:
length_info.value_counts() # 说明长度还可以，不需要做过多处理

False    325
Name: chinese_question, dtype: int64

In [76]:
# 保存
junyi.to_csv("post_junyi_text.csv",index=False)

## 确认下name是否有重复的

In [82]:
check = junyi[junyi['question_name'].duplicated()]

In [83]:
 check #419 和 629 应该要删掉

Unnamed: 0,question_name,chinese_question,q_index
419,question_name,chinese_question,420
629,question_name,chinese_question,630


In [87]:
junyi.drop([419,629],axis=0,inplace=True)

In [88]:
check = junyi[junyi['question_name'].duplicated()]

In [89]:
check

Unnamed: 0,question_name,chinese_question,q_index


In [90]:
junyi.to_csv("post_junyi_text.csv",index=False)