## 1. Import libraries ##

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## 2. Import train dataset ##

In [2]:
dataframe = pd.read_csv('small_train.csv')
df = dataframe.head(100000)
print(df.shape)
print('\n-----------------------------------------\n')
print(df.info())
print('\n-----------------------------------------')
df.head()

(100000, 4)

-----------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ID        100000 non-null  int64 
 1   keywords  88344 non-null   object
 2   age       100000 non-null  int64 
 3   sex       100000 non-null  object
dtypes: int64(2), object(2)
memory usage: 3.1+ MB
None

-----------------------------------------


Unnamed: 0,ID,keywords,age,sex
0,696449,,50,M
1,361410,forum:3;contrat:1;calcul:3;conges:1;mission:4;...,47,M
2,211450,villa:1;location:2;aquitaine:2;maison:1;vacanc...,61,F
3,1368807,trafic:1;tournante:1;drogue:1;france:1;plaque:...,45,M
4,3502570,trafic:1;septembre:1;greve:1;sncf:1;sortir:1;p...,22,M


## 3. Clean train dataset ##

In [3]:
# Drop NA

df.dropna(inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


Unnamed: 0,ID,keywords,age,sex
1,361410,forum:3;contrat:1;calcul:3;conges:1;mission:4;...,47,M
2,211450,villa:1;location:2;aquitaine:2;maison:1;vacanc...,61,F
3,1368807,trafic:1;tournante:1;drogue:1;france:1;plaque:...,45,M
4,3502570,trafic:1;septembre:1;greve:1;sncf:1;sortir:1;p...,22,M
5,2027488,darmanin:1;pour:1;ferme:1;jcms:1;conjoncture:1...,55,M


In [4]:
# Encode sex

df['sex'] = pd.get_dummies(df['sex'], drop_first=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex'] = pd.get_dummies(df['sex'], drop_first=True)


Unnamed: 0,ID,keywords,age,sex
1,361410,forum:3;contrat:1;calcul:3;conges:1;mission:4;...,47,1
2,211450,villa:1;location:2;aquitaine:2;maison:1;vacanc...,61,0
3,1368807,trafic:1;tournante:1;drogue:1;france:1;plaque:...,45,1
4,3502570,trafic:1;septembre:1;greve:1;sncf:1;sortir:1;p...,22,1
5,2027488,darmanin:1;pour:1;ferme:1;jcms:1;conjoncture:1...,55,1


In [5]:
stop_words = stopwords.words()
porter = PorterStemmer()
def cleantext(counttext):
    wordcounts=counttext.split(";")
    text=""
    for words in wordcounts:
        wordocr=words.split(":")
        if len(wordocr)>1 and not wordocr[0] in stop_words :
            wordocr[0]=porter.stem(wordocr[0])
            text+=(wordocr[0]+" ")*int(wordocr[1])
    return text

In [6]:
df = df[df["keywords"].str.contains(":")]

In [7]:
df["keywords"]=df["keywords"].map(lambda x: cleantext(x))

In [8]:
df.dropna(inplace=True)

In [9]:
df.head()

Unnamed: 0,ID,keywords,age,sex
1,361410,forum forum forum contrat calcul calcul calcul...,47,1
2,211450,villa locat locat aquitain aquitain maison vac...,61,0
3,1368807,trafic tournant drogu franc plaqu actualit,45,1
4,3502570,trafic septembr greve sncf sortir perturb maga...,22,1
5,2027488,darmanin ferm jcm conjonctur guichet p1_169806...,55,1


## 4. Split data (test & train) ##

In [10]:
predictionsex = dict()
predictionage = dict()

In [11]:
X_train, X_test, y_train, y_test, z_train, z_test = train_test_split(df["keywords"], df['sex'], df['age'], test_size = 0.2, random_state = 123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(z_train.shape)
print(z_test.shape)

(70675,)
(17669,)
(70675,)
(17669,)
(70675,)
(17669,)


In [12]:
vc = TfidfVectorizer()
vc.fit(X_train.apply(lambda x: np.str_(x)))
vc.fit(X_test.apply(lambda x: np.str_(x)))

TfidfVectorizer()

In [13]:
print("Vocabulary size: {}".format(len(vc.vocabulary_)))

Vocabulary size: 26905


In [14]:
X_train_tf = vc.transform(X_train.apply(lambda x: np.str_(x)))
X_test_tf = vc.transform(X_test.apply(lambda x: np.str_(x)))

## 5. Predict sex##

In [15]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression
model_LR = LogisticRegression(max_iter=10000)
model_LR.fit(X_train_tf, y_train)
predictionsex["LR"] = model_LR.predict(X_test_tf)
model_LR.score(X_test_tf, y_test)

0.5977135095364763

In [16]:
# Naive Bayes

# from sklearn.naive_bayes import MultinomialNB
# model_NB = MultinomialNB()
# model_NB.fit(X_train_tf,y_train)
# predictionsex["NB"] = model_NB.predict(X_test_tf)
# model_NB.score(X_test_tf, y_test)

In [17]:
# Random Forest

# from sklearn.ensemble import RandomForestClassifier
# model_RF = RandomForestClassifier(n_estimators=2000,max_depth=8)
# model_RF.fit(X_train_tf, y_train)
# predictionsex["RF"] = model_RF.predict(X_test_tf)
# model_RF.score(X_test_tf, y_test)

## 6. Predict age##

In [None]:
# Random Forest

from sklearn.ensemble import RandomForestRegressor
model_RF = RandomForestRegressor()
model_RF.fit(X_train_tf,z_train)
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
predictionage["RF"] = model_RF.predict(X_test_tf)
print("Mean square error:%.4f"%(mean_squared_error(z_test,predictionage["RF"])))
print("Mean absolute error:%.4f"%(mean_absolute_error(z_test,predictionage["RF"])))
print("R square: ", '{:.2%}'.format(r2_score(z_test,predictionage["RF"])))

In [None]:
# Linear Regression

# from sklearn.linear_model import LinearRegression
# model_linreg = LinearRegression()
# model_linreg.fit(X_train_tf,z_train)
# from sklearn.metrics import mean_squared_error 
# from sklearn.metrics import mean_absolute_error
# from sklearn.metrics import r2_score
# predictionage["linreg"] = model_linreg.predict(X_test_tf)
# print("Mean square error:%.4f"%(mean_squared_error(z_test,predictionage["linreg"])))
# print("Mean absolute error:%.4f"%(mean_absolute_error(z_test,predictionage["linreg"])))
# print("R square: ", '{:.2%}'.format(r2_score(z_test,predictionage["linreg"])))

In [None]:
# Polynomial Regression

# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# poly_reg = PolynomialFeatures(degree = 2)
# X_poly = poly_reg.fit_transform(X_train_tf)
# regressor = LinearRegression()
# regressor.fit(X_poly, z_train)
# z_pred = regressor.predict(poly_reg.transform(X_test_tf))
# np.set_printoptions(precision=2)
# print(np.concatenate((z_pred.reshape(len(z_pred),1), z_test.reshape(len(z_test),1)),1))
# from sklearn.metrics import r2_score
# r2_score(z_test, z_pred)

In [None]:
# Stochastic Gradient Descent

# from sklearn.linear_model import SGDRegressor
# model_sgd = SGDRegressor(alpha=0.00001,max_iter=10000,random_state=42)
# model_sgd.fit(X_train_tf,z_train)
# from sklearn.metrics import mean_squared_error 
# from sklearn.metrics import mean_absolute_error
# from sklearn.metrics import r2_score
# predictionage = dict()
# predictionage["sgd"] = model_sgd.predict(X_test_tf)
# print("Mean square error:%.4f"%(mean_squared_error(z_test,predictionage["sgd"])))
# print("Mean absolute error:%.4f"%(mean_absolute_error(z_test,predictionage["sgd"])))
# print("R square: ", '{:.2%}'.format(r2_score(z_test,predictionage["sgd"])))

## 7. Import test dataset##

In [None]:
dataframe_test = pd.read_csv('small_test.csv')
df_test = dataframe_test.head(10000)

## 8. Clean test dataset##

In [None]:
df_test = df_test[['ID','keywords']]
df_test.dropna(inplace = True)
df_test.head()

In [None]:
df_test = df_test[df_test["keywords"].str.contains(":")]
df_test["keywords"] = df_test["keywords"].map(lambda x: cleantext(x))
df_test.dropna(inplace = True)
df_test.head()

In [None]:
X_test_final = vc.transform(df_test["keywords"].apply(lambda x: np.str_(x)))

## 9. Apply model##

In [None]:
result=pd.DataFrame()
result["ID"]=test["ID"]
result["sex_pred"]=model_LR.predict(X_test_final)

In [None]:
result["sex_pred"]=result["sex_pred"].replace(1, "M")
result["sex_pred"]=result["sex_pred"].replace(0, "F")

In [None]:
result["age_pred"]=model_RF.predict(X_test_final)

In [None]:
result.head()

## 10. Export results##

In [None]:
result.to_csv("final_results.csv",index=False,sep=',')